/* **********************************************************
 * Copyright (c) 2011-2014 Google, Inc.  All rights reserved.
 * Copyright (c) 2000-2010 VMware, Inc.  All rights reserved.
 * **********************************************************/

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of VMware, Inc. nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */

/*
 * signal.c - dynamorio signal handler
 */

#include <unistd.h>
#include <errno.h>
#undef errno

#include "signal_private.h" /* pulls in globals.h for us, in right order */

#ifdef LINUX
/* We want to build on older toolchains so we have our own copy of signal
 * data structures
 */
#  include "include/sigcontext.h"
#  include "include/signalfd.h"
#  include "../globals.h" /* after our sigcontext.h, to preclude bits/sigcontext.h */
#elif defined(MACOS)
#  include "../globals.h" /* this defines _XOPEN_SOURCE for Mac */
#  include <signal.h> /* after globals.h, for _XOPEN_SOURCE from os_exports.h */
#endif

#ifdef LINUX
#  include <linux/sched.h>
#endif

#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <ucontext.h>
#include <string.h> /* for memcpy and memset */
#include "os_private.h"
#include "../fragment.h"
#include "../fcache.h"
#include "../perfctr.h"
#include "arch.h"
#include "../monitor.h" /* for trace_abort */
#include "../link.h" /* for linking interrupted fragment_t */
#include "instr.h" /* to find target of SIGSEGV */
#include "decode.h" /* to find target of SIGSEGV */
#include "decode_fast.h" /* to handle self-mod code */
#include "../synch.h"
#include "../nudge.h"
#include "disassemble.h"
#include "ksynch.h"
#include "tls.h" /* tls_reinstate_selector */
#include "../translate.h"

#ifdef LINUX
# include "include/syscall.h"
#else
# include <sys/syscall.h>
#endif

#ifdef CLIENT_INTERFACE
# include "instrument.h"
#endif

#ifdef VMX86_SERVER
# include <errno.h>
#endif

#ifdef MACOS
/* Define the Linux names, which the code is already using */
#  define SA_NOMASK       SA_NODEFER
#  define SA_ONESHOT      SA_RESETHAND
#endif

/**** data structures ***************************************************/

/* The signal numbers are slightly different between operating systems.
 * To support differing default actions, we have separate arrays, rather
 * than indirecting to a single all-signals array.
 */
extern int default_action[];

/* We know that many signals are always asynchronous.
 * Others, however, may be synchronous or may not -- e.g., another process
 * could send us a SIGSEGV, and there is no way we can tell whether it
 * was generated by a real memory fault or not.  Thus we have to assume
 * that we must not delay any SIGSEGV deliveries.
 */
extern bool can_always_delay[];

/* Returns true iff sig is one of the three itimer-driven signals
 * (SIGALRM, SIGVTALRM, SIGPROF).
 */
static inline bool
sig_is_alarm_signal(int sig)
{
    switch (sig) {
    case SIGALRM:
    case SIGVTALRM:
    case SIGPROF:
        return true;
    default:
        return false;
    }
}

/* we do not use SIGSTKSZ b/c for things like code modification
 * we end up calling many core routines and so want more space
 * (though currently non-debug stack size == SIGSTKSZ (8KB))
 */
/* this size is assumed in heap.c's threadunits_exit leak relaxation */
#define SIGSTACK_SIZE DYNAMORIO_STACK_SIZE

/* this flag not defined in our headers */
#define SA_RESTORER 0x04000000

/* if no app sigaction, it's RT, since that's our handler */
#ifdef LINUX
#  define IS_RT_FOR_APP(info, sig) \
  IF_X64_ELSE(true, ((info)->app_sigaction[(sig)] == NULL ? true : \
                     (TEST(SA_SIGINFO, (info)->app_sigaction[(sig)]->flags))))
#elif defined(MACOS)
#  define IS_RT_FOR_APP(info, sig) (true)
#endif

/* kernel sets size and sp to 0 for SS_DISABLE
 * when asked, will hand back SS_ONSTACK only if current xsp is inside the
 * alt stack; otherwise, if an alt stack is registered, it will give flags of 0
 * We do not support the "legacy stack switching" that uses the restorer field
 * as seen in kernel sources.
 */
#define APP_HAS_SIGSTACK(info) \
  ((info)->app_sigstack.ss_sp != NULL && (info)->app_sigstack.ss_flags != SS_DISABLE)

/* Extra space needed to put the signal frame on the app stack.  We include the
 * size of the extra padding potentially needed to align these structs.  We
 * assume the stack pointer is 4-aligned already, so we over estimate padding
 * size by the alignment minus 4.
 */
#ifdef LINUX
/* An extra 4 for trailing FP_XSTATE_MAGIC2 */
#  define AVX_FRAME_EXTRA (sizeof(struct _xstate) + AVX_ALIGNMENT - 4 + 4)
#  define FPSTATE_FRAME_EXTRA (sizeof(struct _fpstate) + FPSTATE_ALIGNMENT - 4)
#  define XSTATE_FRAME_EXTRA (YMM_ENABLED() ? AVX_FRAME_EXTRA : FPSTATE_FRAME_EXTRA)

#  define AVX_DATA_SIZE (sizeof(struct _xstate) + 4)
#  define FPSTATE_DATA_SIZE (sizeof(struct _fpstate))
#  define XSTATE_DATA_SIZE (YMM_ENABLED() ? AVX_DATA_SIZE : FPSTATE_DATA_SIZE)

#elif defined(MACOS)
/* Currently assuming __darwin_mcontext_avx{32,64} is always used in the
 * frame.  If instead __darwin_mcontext{32,64} is used (w/ just float and no AVX)
 * on, say, older machines or OSX versions, we'll have to revisit this.
 */
#  define AVX_FRAME_EXTRA 0
#  define FPSTATE_FRAME_EXTRA 0
#  define XSTATE_FRAME_EXTRA 0
#  define AVX_DATA_SIZE 0
#  define FPSTATE_DATA_SIZE 0
#  define XSTATE_DATA_SIZE 0
#endif

/* If we only intercept a few signals, we leave the blocked state of the
 * un-intercepted signals unchanged and stored in the kernel.  If we intercept
 * all (not quite yet: PR 297033, hence the need for this macro) we emulate the
 * mask for all.
 */
#define EMULATE_SIGMASK(info, sig) \
    (DYNAMO_OPTION(intercept_all_signals) || (info)->we_intercept[(sig)])

/* i#27: custom data to pass to the child of a clone */
/* PR i#149/403015: clone record now passed via a new dstack */
/* Filled in by create_clone_record() (and optionally adjusted by
 * set_clone_record_fields()) on the creating thread; read back through
 * get_clone_record() and the get_clone_record_*() accessors, and by
 * signal_thread_inherit().
 */
typedef struct _clone_record_t {
    byte *dstack;          /* dstack for new thread - allocated by parent thread */
#ifdef MACOS
    /* XXX i#1403: once we have lower-level, earlier thread interception we can
     * likely switch to something closer to what we do on Linux.
     * This is used for bsdthread_create, where app_thread_xsp is NULL;
     * for vfork, app_thread_xsp is non-NULL and this is unused.
     */
    void *thread_arg;
#endif
    reg_t app_thread_xsp;  /* app xsp preserved for new thread to use */
    /* pc handed back by signal_thread_inherit(); create_clone_record() takes it
     * from dcontext->asynch_target (or from thread_func on the Mac
     * bsdthread_create path)
     */
    app_pc continuation_pc;
    /* dcontext->owning_thread of the thread that built this record */
    thread_id_t caller_id;
    /* dcontext->sys_num of the creating syscall */
    int clone_sysnum;
    /* dcontext->sys_param0 of the creating syscall, or a fixed flag set on the
     * Mac bsdthread_create path; signal_thread_inherit() overwrites it for vfork
     */
    uint clone_flags;
    /* by-value copy of the creator's thread_sig_info_t taken at record creation */
    thread_sig_info_t info;
    /* pointer to the creator's live thread_sig_info_t (its signal_field) */
    thread_sig_info_t *parent_info;
    /* creator's dcontext->pcprofile_field */
    void *pcprofile_info;
    /* we leave some padding at base of stack for dynamorio_clone
     * to store values
     */
    reg_t for_dynamorio_clone[4];
} clone_record_t;

/* i#350: set up signal handler for safe_read/faults during init */
/* init_info holds the handler tables that signal_init() sets up before any
 * thread has its own thread_sig_info_t.  signal_thread_inherit() tears it down
 * when called without a clone record while dynamo_initialized is still false.
 */
static thread_sig_info_t init_info;
/* Mask saved by signal_init()'s unblock_all_signals() call; it is reinstated
 * by signal_thread_inherit() at the same time init_info is torn down.
 */
static kernel_sigset_t init_sigmask;

#ifdef DEBUG
/* Set to true by set_default_signal_action() after it resets a handler */
static bool removed_sig_handler;
#endif

/**** function prototypes ***********************************************/

/* in x86.asm */
void
master_signal_handler(int sig, siginfo_t *siginfo, kernel_ucontext_t *ucxt);

static void
intercept_signal(dcontext_t *dcontext, thread_sig_info_t *info, int sig);

static void
signal_info_init_sigaction(dcontext_t *dcontext, thread_sig_info_t *info);

static void
signal_info_exit_sigaction(dcontext_t *dcontext, thread_sig_info_t *info,
                           bool other_thread);

static bool
execute_handler_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame,
                           sigcontext_t *sc_orig, fragment_t *f
                           _IF_CLIENT(byte *access_address));

static bool
execute_handler_from_dispatch(dcontext_t *dcontext, int sig);

/* Execute default action from code cache and may terminate the process.
 * If returns, the return value decides if caller should restore
 * the untranslated context.
 */
static bool
execute_default_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                           sigcontext_t *sc_orig);

static void
execute_default_from_dispatch(dcontext_t *dcontext, int sig, sigframe_rt_t *frame);

static bool
handle_alarm(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt);

static bool
handle_suspend_signal(dcontext_t *dcontext, kernel_ucontext_t *ucxt);

static bool
handle_nudge_signal(dcontext_t *dcontext, siginfo_t *siginfo, kernel_ucontext_t *ucxt);

static void
init_itimer(dcontext_t *dcontext, bool first);

static bool
set_actual_itimer(dcontext_t *dcontext, int which, thread_sig_info_t *info,
                  bool enable);

#ifdef DEBUG
static void
dump_sigset(dcontext_t *dcontext, kernel_sigset_t *set);
#endif

static bool
is_sys_kill(dcontext_t *dcontext, byte *pc, byte *xsp, siginfo_t *info);

/* Issues the raw sigaction system call (SYS_sigaction on Mac,
 * SYS_rt_sigaction otherwise) through dynamorio_syscall and returns its
 * result.
 *
 * On 64-bit Linux builds without VMX86_SERVER, a non-NULL act that lacks
 * SA_RESTORER is modified in place: the flag is set and restorer is pointed
 * at dynamorio_sigreturn.  Callers that pass a stored app sigaction should be
 * aware that their struct may be changed by this call.
 */
static inline int
sigaction_syscall(int sig, kernel_sigaction_t *act, kernel_sigaction_t *oact)
{
#if defined(X64) && !defined(VMX86_SERVER) && defined(LINUX)
    /* PR 305020: must have SA_RESTORER for x64 */
    if (act != NULL && !TEST(SA_RESTORER, act->flags)) {
        act->flags |= SA_RESTORER;
        act->restorer = (void (*)(void)) dynamorio_sigreturn;
    }
#endif
    /* the 4th argument is the size of the kernel's signal set type */
    return dynamorio_syscall(IF_MACOS_ELSE(SYS_sigaction,SYS_rt_sigaction),
                             4, sig, act, oact, sizeof(kernel_sigset_t));
}

/* Thin wrapper: issues SYS_sigaltstack through dynamorio_syscall with
 * newstack and oldstack as its two arguments and returns the result.
 */
static inline int
sigaltstack_syscall(const stack_t *newstack, stack_t *oldstack)
{
    return dynamorio_syscall(SYS_sigaltstack, 2, newstack, oldstack);
}

/* Thin wrapper: issues SYS_getitimer through dynamorio_syscall for timer
 * 'which', passing val as the second argument, and returns the result.
 */
static inline int
getitimer_syscall(int which, struct itimerval *val)
{
    return dynamorio_syscall(SYS_getitimer, 2, which, val);
}

/* Thin wrapper: issues SYS_setitimer through dynamorio_syscall for timer
 * 'which', passing val and old as the remaining arguments, and returns the
 * result.
 */
static inline int
setitimer_syscall(int which, struct itimerval *val, struct itimerval *old)
{
    return dynamorio_syscall(SYS_setitimer, 3, which, val, old);
}

/* Thin wrapper: issues the sigprocmask system call (SYS_sigprocmask on Mac,
 * SYS_rt_sigprocmask otherwise) through dynamorio_syscall with how, set,
 * oset and sigsetsize as its four arguments, and returns the result.
 */
static inline int
sigprocmask_syscall(int how, kernel_sigset_t *set, kernel_sigset_t *oset,
                    size_t sigsetsize)
{
    return dynamorio_syscall(IF_MACOS_ELSE(SYS_sigprocmask,SYS_rt_sigprocmask),
                             4, how, set, oset, sigsetsize);
}

/* Installs an empty signal mask via SIG_SETMASK.  oset is forwarded as the
 * old-mask argument of the sigprocmask call.  The syscall result is not
 * checked.
 */
static void
unblock_all_signals(kernel_sigset_t *oset)
{
    kernel_sigset_t empty_mask;

    kernel_sigemptyset(&empty_mask);
    sigprocmask_syscall(SIG_SETMASK, &empty_mask, oset, sizeof(empty_mask));
}

/* exported for stackdump.c */
/* Resets the disposition of sig to SIG_DFL and then issues a SIG_UNBLOCK
 * for sig.  Returns true iff the sigaction syscall returned 0; the result
 * of the unblock request is not examined.
 */
bool
set_default_signal_action(int sig)
{
    kernel_sigaction_t dfl_act;
    kernel_sigset_t unblock_set;
    int res;

    /* Prepare both kernel arguments before making any system call */
    memset(&dfl_act, 0, sizeof(dfl_act));
    dfl_act.handler = (handler_t) SIG_DFL;
    kernel_sigemptyset(&unblock_set);
    kernel_sigaddset(&unblock_set, sig);

    /* arm the signal */
    res = sigaction_syscall(sig, &dfl_act, NULL);
    DODEBUG({ removed_sig_handler = true; });

    /* If we're in our handler now, we have to unblock */
    sigprocmask_syscall(SIG_UNBLOCK, &unblock_set, NULL, sizeof(unblock_set));

    return (res == 0);
}

/* We assume that signal handlers will be shared most of the time
 * (pthreads shares them)
 * Rather than start out with the handler table in local memory and then
 * having to transfer to global, we just always use global
 */
/* Releases a handler-table allocation of the given size back to the global
 * heap.  dcontext is accepted for symmetry with handler_alloc() but is not
 * used.  size is passed straight to global_heap_free.
 */
static void
handler_free(dcontext_t *dcontext, void *p, size_t size)
{
    global_heap_free(p, size HEAPACCT(ACCT_OTHER));
}

/* Allocates size bytes for handler tables from the global heap (accounted
 * under ACCT_OTHER) and returns the pointer from global_heap_alloc.
 * dcontext is not used.  The memory is not zeroed here; callers in this file
 * memset or memcpy over it.  Free with handler_free() using the same size.
 */
static void *
handler_alloc(dcontext_t *dcontext, size_t size)
{
    return global_heap_alloc(size HEAPACCT(ACCT_OTHER));
}

/**** top-level routines ***********************************************/

/* Reports whether itimers are shared among the threads of a process on this
 * kernel (true) or are private to each thread (false).
 *
 * The answer is computed on the first call by parsing the kernel version out
 * of /proc/version and is cached in function-local statics for later calls.
 * If the file cannot be opened or the version cannot be parsed, we assume
 * the timers are not shared.
 */
static bool
os_itimers_thread_shared(void)
{
    static bool itimers_shared;
    static bool cached = false;
    if (!cached) {
        file_t f = os_open("/proc/version", OS_OPEN_READ);
        if (f != INVALID_FILE) {
            char buf[128];
            int major, minor, rel;
            /* Zero the buffer first: the read may be short or fail, and we
             * must not let sscanf parse leftover stack contents.
             */
            memset(buf, 0, sizeof(buf));
            os_read(f, buf, BUFFER_SIZE_ELEMENTS(buf));
            NULL_TERMINATE_BUFFER(buf);
            if (sscanf(buf, "%*s %*s %d.%d.%d", &major, &minor, &rel) == 3) {
                /* Linux NPTL in kernel 2.6.12+ has POSIX-style itimers shared
                 * among threads.
                 */
                LOG(GLOBAL, LOG_ASYNCH, 1, "kernel version = %d.%d.%d\n",
                    major, minor, rel);
                /* Compare (major, minor, rel) against (2, 6, 12) as an
                 * ordered tuple rather than component by component.
                 */
                itimers_shared = (major > 2 ||
                                  (major == 2 &&
                                   (minor > 6 || (minor == 6 && rel >= 12))));
                cached = true;
            }
            os_close(f);
        }
        if (!cached) {
            /* assume not shared */
            itimers_shared = false;
            cached = true;
        }
        LOG(GLOBAL, LOG_ASYNCH, 1, "itimers are %s\n",
            itimers_shared ? "thread-shared" : "thread-private");
    }
    return itimers_shared;
}

/* Process-wide signal setup performed during DR init: checks layout
 * assumptions, primes the cached itimer-sharing answer, and installs a
 * partial early handler setup (SIGSEGV and SIGBUS only) in init_info.
 */
void
signal_init()
{
    /* Fault signals that must be interceptable before any thread is set up */
    static const int early_fault_sigs[] = { SIGSEGV, SIGBUS };
    size_t idx;

    IF_LINUX(IF_X64(ASSERT(ALIGNED(offsetof(sigpending_t, xstate), AVX_ALIGNMENT))));
    IF_MACOS(ASSERT(sizeof(kernel_sigset_t) == sizeof(__darwin_sigset_t)));
    /* Called for its side effect of computing and caching the result */
    os_itimers_thread_shared();

    /* Set up a handler for safe_read (or other fault detection) during
     * DR init before thread is initialized.
     *
     * XXX: could set up a clone_record_t and pass to the initial
     * signal_thread_inherit() but that would require further code changes.
     * Could also call signal_thread_inherit to init this, but we don't want
     * to intercept timer signals, etc. before we're ready to handle them,
     * so we do a partial init.
     */
    signal_info_init_sigaction(GLOBAL_DCONTEXT, &init_info);
    for (idx = 0; idx < BUFFER_SIZE_ELEMENTS(early_fault_sigs); idx++) {
        intercept_signal(GLOBAL_DCONTEXT, &init_info, early_fault_sigs[idx]);
    }
    /* The prior mask is kept in init_sigmask so it can be reinstated later */
    unblock_all_signals(&init_sigmask);

    IF_LINUX(signalfd_init());
}

/* Process-wide signal teardown: shuts down signalfd support on Linux and,
 * in DEBUG builds, logs the total number of signals delivered.
 */
void
signal_exit()
{
    IF_LINUX(signalfd_exit());
#ifdef DEBUG
    /* Nothing to report unless logging is enabled for ASYNCH or STATS */
    if (!(stats->loglevel > 0) || (stats->logmask & (LOG_ASYNCH|LOG_STATS)) == 0)
        return;
    LOG(GLOBAL, LOG_ASYNCH|LOG_STATS, 1,
        "Total signals delivered: %d\n", GLOBAL_STAT(num_signals));
#endif
}

/* Allocates and zeroes this thread's thread_sig_info_t, stores it in
 * dcontext->signal_field, and sets up the pieces that do not depend on
 * parentage:
 *  - the special heap of sigpending_t units (created without locking);
 *  - under HAVE_SIGALTSTACK, DR's alternate signal stack, registered via
 *    sigaltstack with the previously registered stack saved in app_sigstack;
 *  - an empty app_sigblocked set;
 *  - child_lock.
 * Handler tables and itimer state are NOT set up here: the caller must
 * follow up with signal_thread_inherit().
 */
void
signal_thread_init(dcontext_t *dcontext)
{
#ifdef HAVE_SIGALTSTACK
    int rc;
#endif
    thread_sig_info_t *info = HEAP_TYPE_ALLOC(dcontext, thread_sig_info_t,
                                              ACCT_OTHER, PROTECTED);

    /* all fields want to be initialized to 0 */
    memset(info, 0, sizeof(thread_sig_info_t));
    dcontext->signal_field = (void *) info;

    /* our special heap to avoid reentrancy problems
     * composed entirely of sigpending_t units
     * Note that it's fine to have the special heap do page-at-a-time
     * committing, which does not use locks (unless triggers reset!),
     * but if we need a new unit that will grab a lock: we try to
     * avoid that by limiting the # of pending alarm signals (PR 596768).
     */
    info->sigheap = special_heap_init(sizeof(sigpending_t),
                                      false /* cannot have any locking */,
                                      false /* -x */,
                                      true /* persistent */);

#ifdef HAVE_SIGALTSTACK
    /* set up alternate stack
     * i#552 we may terminate the process without freeing the stack, so we
     * stack_alloc it to exempt from the memory leak check.
     */
    /* NOTE(review): stack_alloc presumably returns the high end of the region
     * (create_clone_record's comment says so for dstack), hence the
     * subtraction to obtain the low address stored in ss_sp.
     */
    info->sigstack.ss_sp = (char *) stack_alloc(SIGSTACK_SIZE) - SIGSTACK_SIZE;
    info->sigstack.ss_size = SIGSTACK_SIZE;
    /* kernel will set xsp to sp+size to grow down from there, we don't have to */
    info->sigstack.ss_flags = 0;
    /* second argument receives whatever alternate stack was registered before */
    rc = sigaltstack_syscall(&info->sigstack, &info->app_sigstack);
    ASSERT(rc == 0);
    LOG(THREAD, LOG_ASYNCH, 1, "signal stack is "PFX" - "PFX"\n",
        info->sigstack.ss_sp, info->sigstack.ss_sp + info->sigstack.ss_size);
    /* app_sigstack dealt with below, based on parentage */
#endif

    kernel_sigemptyset(&info->app_sigblocked);

    ASSIGN_INIT_LOCK_FREE(info->child_lock, child_lock);

    /* someone must call signal_thread_inherit() to finish initialization:
     * for first thread, called from initial setup; else, from new_thread_setup
     * or share_siginfo_after_take_over.
     */
}

/* i#27: create custom data to pass to the child of a clone
 * since we can't rely on being able to find the caller, or that
 * its syscall data is still valid, once in the child.
 *
 * i#149/ PR 403015: The clone record is passed to the new thread via the dstack
 * created for it.  Unlike before, where the child thread would create its own
 * dstack, now the parent thread creates the dstack.  Also, switches app stack
 * to dstack.
 *
 * XXX i#1403: for Mac we want to eventually do lower-level earlier interception
 * of threads, but for now we're later and higher-level, intercepting the user
 * thread function on the new thread's stack.  We ignore app_thread_xsp.
 *
 * Parameters:
 *   dcontext       - the creating thread's context; its asynch_target,
 *                    sys_param0, sys_num, owning_thread, signal_field and
 *                    pcprofile_field are read to populate the record.
 *   app_thread_xsp - in/out: the app stack pointer intended for the new
 *                    thread.  Its value is saved in the record and, on X86,
 *                    *app_thread_xsp is overwritten with an
 *                    XSTATE_ALIGNMENT-aligned address just below the record
 *                    on the new dstack.  On Mac it may be NULL, in which case
 *                    the record is heap-allocated and nothing is written back.
 *   thread_func, thread_arg (Mac only) - stored in the record when
 *                    app_thread_xsp is NULL.
 * Returns the new record as an opaque pointer.
 */
void *
#ifdef MACOS
create_clone_record(dcontext_t *dcontext, reg_t *app_thread_xsp,
                    app_pc thread_func, void *thread_arg)
#else
create_clone_record(dcontext_t *dcontext, reg_t *app_thread_xsp)
#endif
{
    clone_record_t *record;
    /* the dstack is allocated on every path, including the Mac heap-record one */
    byte *dstack = stack_alloc(DYNAMORIO_STACK_SIZE);
    LOG(THREAD, LOG_ASYNCH, 1,
        "create_clone_record: dstack for new thread is "PFX"\n", dstack);

#ifdef MACOS
    if (app_thread_xsp == NULL) {
        /* no app xsp to redirect, so the record lives on the global heap
         * rather than on the dstack; app_thread_xsp == 0 marks this case
         */
        record = HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, clone_record_t,
                                 ACCT_THREAD_MGT, true/*prot*/);
        record->app_thread_xsp = 0;
        record->continuation_pc = thread_func;
        record->thread_arg = thread_arg;
        record->clone_flags = CLONE_THREAD | CLONE_VM | CLONE_SIGHAND | SIGCHLD;
    } else {
#endif
        /* Note, the stack grows to low memory addr, so dstack points to the high
         * end of the allocated stack region.  So, we must subtract to get space for
         * the clone record.
         */
        record = (clone_record_t *) (dstack - sizeof(clone_record_t));
        record->app_thread_xsp = *app_thread_xsp;
        /* asynch_target is set in dispatch() prior to calling pre_system_call(). */
        record->continuation_pc = dcontext->asynch_target;
        record->clone_flags = dcontext->sys_param0;
#ifdef MACOS
    }
#endif
    LOG(THREAD, LOG_ASYNCH, 1, "allocated clone record: "PFX"\n", record);

    /* fields common to both record locations */
    record->dstack = dstack;
    record->caller_id = dcontext->owning_thread;
    record->clone_sysnum = dcontext->sys_num;
    /* snapshot of the creator's signal info, plus a pointer to the live copy */
    record->info = *((thread_sig_info_t *)dcontext->signal_field);
    record->parent_info = (thread_sig_info_t *) dcontext->signal_field;
    record->pcprofile_info = dcontext->pcprofile_field;
    LOG(THREAD, LOG_ASYNCH, 1,
        "create_clone_record: thread "TIDFMT", pc "PFX"\n",
        record->caller_id, record->continuation_pc);

#ifdef X86
# ifdef MACOS
    if (app_thread_xsp != NULL) {
# endif
        /* Set the thread stack to point to the dstack, below the clone record.
         * Note: it's glibc who sets up the arg to the thread start function;
         * the kernel just does a fork + stack swap, so we can get away w/ our
         * own stack swap if we restore before the glibc asm code takes over.
         */
        /* i#754: set stack to be XSTATE aligned for saving YMM registers */
        ASSERT(ALIGNED(XSTATE_ALIGNMENT, REGPARM_END_ALIGN));
        *app_thread_xsp = ALIGN_BACKWARD(record, XSTATE_ALIGNMENT);
# ifdef MACOS
    }
# endif
#elif defined(ARM)
    /* FIXME i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */

    return (void *) record;
}

/* This is to support dr_create_client_thread() */
/* Overwrites four fields of an existing clone record (passed as an opaque
 * pointer) with the supplied values.  All other fields are left untouched.
 */
void
set_clone_record_fields(void *record, reg_t app_thread_xsp, app_pc continuation_pc,
                        uint clone_sysnum, uint clone_flags)
{
    clone_record_t *target = (clone_record_t *) record;

    ASSERT(target != NULL);
    target->clone_flags = clone_flags;
    target->clone_sysnum = clone_sysnum;
    target->continuation_pc = continuation_pc;
    target->app_thread_xsp = app_thread_xsp;
}

/* i#149/PR 403015: the creating thread places the clone record at the very
 * base (highest addresses) of the new thread's dstack, and that dstack is
 * installed as the thread's initial stack.  Given the new thread's current
 * stack pointer, this routine locates and returns that record.
 *
 * CAUTION: keep stack usage in this routine small.  It runs on the dstack
 *          from new_thread_setup and relies on less than one page of the
 *          dstack having been consumed since the clone system call.
 */
void *
get_clone_record(reg_t xsp)
{
    byte *stack_top;
    clone_record_t *rec;

    /* xsp should be in a dstack, i.e., dynamorio heap.  */
    ASSERT(is_dynamo_address((app_pc) xsp));

    /* We assume that
     *   sizeof(clone_record_t)
     *   + the stack new_thread_start uses (just to build a priv_mcontext_t)
     *   + the stack new_thread_setup uses before calling us
     * stays under one page, so rounding xsp up to a page boundary lands on
     * the dstack base.  Current usage is roughly 800 bytes for the clone
     * record plus sizeof(priv_mcontext_t) plus a few words.  If the total
     * ever grows past a page that will show up deterministically during
     * development via the assert below, not randomly at runtime.
     */
    stack_top = (byte *) ALIGN_FORWARD(xsp, PAGE_SIZE);
    rec = (clone_record_t *) (stack_top - sizeof(clone_record_t));

    /* The record stores its own dstack base: it must match what we computed. */
    ASSERT(stack_top == rec->dstack);
#ifdef MACOS
    ASSERT(rec->app_thread_xsp != 0); /* else it's not in dstack */
#endif
    return (void *) rec;
}

/* i#149/PR 403015: App xsp is passed to the new thread via the clone record. */
reg_t
get_clone_record_app_xsp(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *) record)->app_thread_xsp;
}

#ifdef MACOS
void *
get_clone_record_thread_arg(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *) record)->thread_arg;
}
#endif

byte *
get_clone_record_dstack(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *) record)->dstack;
}

/* Initializes info's app_sigaction, restorer_valid, and we_intercept fields:
 * allocates the two tables with handler_alloc(), zeroes them, and fills
 * restorer_valid with 0xff bytes (i.e. -1 per byte).
 */
static void
signal_info_init_sigaction(dcontext_t *dcontext, thread_sig_info_t *info)
{
    const size_t action_table_sz = SIGARRAY_SIZE * sizeof(kernel_sigaction_t *);
    const size_t intercept_table_sz = SIGARRAY_SIZE * sizeof(bool);

    /* table of per-signal app sigaction pointers, all initially NULL */
    info->app_sigaction = (kernel_sigaction_t **) handler_alloc(dcontext,
                                                                action_table_sz);
    memset(info->app_sigaction, 0, action_table_sz);

    memset(&info->restorer_valid, -1, SIGARRAY_SIZE * sizeof(info->restorer_valid[0]));

    /* table of per-signal "do we intercept" flags, all initially false */
    info->we_intercept = (bool *) handler_alloc(dcontext, intercept_table_sz);
    memset(info->we_intercept, 0, intercept_table_sz);
}

/* Cleans up info's app_sigaction, restorer_valid, and we_intercept fields.
 *
 * When other_thread is false, the kernel's handler for each signal is also
 * reset before the tables are freed:
 *  - if the app had registered a sigaction, that sigaction is reinstalled
 *    (with its handler replaced by SIG_IGN when dynamo_exited is set);
 *  - otherwise, if we were intercepting the signal, SIG_DFL is installed.
 * When other_thread is true, no system calls are made and only the memory is
 * released.
 *
 * Each non-NULL app_sigaction entry, then the app_sigaction and we_intercept
 * tables themselves, are released with handler_free().  The results of the
 * sigaction syscalls are not checked.
 */
static void
signal_info_exit_sigaction(dcontext_t *dcontext, thread_sig_info_t *info,
                           bool other_thread)
{
    int i;
    kernel_sigaction_t act;
    memset(&act, 0, sizeof(act));
    act.handler = (handler_t) SIG_DFL;
    kernel_sigemptyset(&act.mask); /* does mask matter for SIG_DFL? */
    for (i = 1; i <= MAX_SIGNUM; i++) {
        if (!other_thread) {
            if (info->app_sigaction[i] != NULL) {
                /* restore to old handler, but not if exiting whole
                 * process: else may get itimer during cleanup, so we
                 * set to SIG_IGN (we'll have to fix once we impl detach)
                 */
                if (dynamo_exited) {
                    info->app_sigaction[i]->handler = (handler_t) SIG_IGN;
                }
                LOG(THREAD, LOG_ASYNCH, 2, "\trestoring "PFX" as handler for %d\n",
                    info->app_sigaction[i]->handler, i);
                /* A single syscall suffices on both paths: the struct
                 * already holds the handler we want the kernel to have.
                 */
                sigaction_syscall(i, info->app_sigaction[i], NULL);
            } else if (info->we_intercept[i]) {
                /* restore to default */
                LOG(THREAD, LOG_ASYNCH, 2, "\trestoring SIG_DFL as handler for %d\n", i);
                sigaction_syscall(i, &act, NULL);
            }
        }
        if (info->app_sigaction[i] != NULL) {
            handler_free(dcontext, info->app_sigaction[i],
                         sizeof(kernel_sigaction_t));
        }
    }
    handler_free(dcontext, info->app_sigaction,
                 SIGARRAY_SIZE * sizeof(kernel_sigaction_t *));
    handler_free(dcontext, info->we_intercept, SIGARRAY_SIZE * sizeof(bool));
}

/* Called once a new thread's dcontext is created.
 * Inherited and shared fields are set up here.
 * The clone_record contains the continuation pc, which is returned.
 */
app_pc
signal_thread_inherit(dcontext_t *dcontext, void *clone_record)
{
    app_pc res = NULL;
    clone_record_t *record = (clone_record_t *) clone_record;
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    kernel_sigaction_t oldact;
    int i, rc;
    if (record != NULL) {
        app_pc continuation_pc = record->continuation_pc;
        LOG(THREAD, LOG_ASYNCH, 1,
            "continuation pc is "PFX"\n", continuation_pc);
        LOG(THREAD, LOG_ASYNCH, 1,
            "parent tid is "TIDFMT", parent sysnum is %d(%s), clone flags="PIFX"\n",
            record->caller_id, record->clone_sysnum,
            (record->clone_sysnum == SYS_vfork) ? "vfork" :
            (IF_LINUX(record->clone_sysnum == SYS_clone ? "clone" :)
             IF_MACOS(record->clone_sysnum == SYS_bsdthread_create ? "bsdthread_create":)
             "unexpected"), record->clone_flags);
        if (record->clone_sysnum == SYS_vfork) {
            /* The above clone_flags argument is bogus.
               SYS_vfork doesn't have a free register to keep the hardcoded value
               see /usr/src/linux/arch/i386/kernel/process.c */
            /* CHECK: is this the only place real clone flags are needed? */
            record->clone_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;
        }

        /* handlers are either inherited or shared */
        if (TEST(CLONE_SIGHAND, record->clone_flags)) {
            /* need to share table of handlers! */
            LOG(THREAD, LOG_ASYNCH, 2, "sharing signal handlers with parent\n");
            info->shared_app_sigaction = true;
            info->shared_refcount = record->info.shared_refcount;
            info->shared_lock = record->info.shared_lock;
            info->app_sigaction = record->info.app_sigaction;
            info->we_intercept = record->info.we_intercept;
            mutex_lock(info->shared_lock);
            (*info->shared_refcount)++;
#ifdef DEBUG
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (info->app_sigaction[i] != NULL) {
                    LOG(THREAD, LOG_ASYNCH, 2, "\thandler for signal %d is "PFX"\n",
                        i, info->app_sigaction[i]->handler);
                }
            }
#endif
            mutex_unlock(info->shared_lock);
        } else {
            /* copy handlers */
            LOG(THREAD, LOG_ASYNCH, 2, "inheriting signal handlers from parent\n");
            info->app_sigaction = (kernel_sigaction_t **)
                handler_alloc(dcontext, SIGARRAY_SIZE * sizeof(kernel_sigaction_t *));
            memset(info->app_sigaction, 0, SIGARRAY_SIZE * sizeof(kernel_sigaction_t *));
            for (i = 1; i <= MAX_SIGNUM; i++) {
                info->restorer_valid[i] = -1;  /* clear cache */
                if (record->info.app_sigaction[i] != NULL) {
                    info->app_sigaction[i] = (kernel_sigaction_t *)
                        handler_alloc(dcontext, sizeof(kernel_sigaction_t));
                    memcpy(info->app_sigaction[i], record->info.app_sigaction[i],
                           sizeof(kernel_sigaction_t));
                    LOG(THREAD, LOG_ASYNCH, 2, "\thandler for signal %d is "PFX"\n",
                        i, info->app_sigaction[i]->handler);
                }
            }
            info->we_intercept = (bool *)
                handler_alloc(dcontext, SIGARRAY_SIZE * sizeof(bool));
            memcpy(info->we_intercept, record->info.we_intercept,
                   SIGARRAY_SIZE * sizeof(bool));
            mutex_lock(&record->info.child_lock);
            record->info.num_unstarted_children--;
            mutex_unlock(&record->info.child_lock);
            /* this should be safe since parent should wait for us */
            mutex_lock(&record->parent_info->child_lock);
            record->parent_info->num_unstarted_children--;
            mutex_unlock(&record->parent_info->child_lock);
        }

        /* itimers are either private or shared */
        if (TEST(CLONE_THREAD, record->clone_flags) && os_itimers_thread_shared()) {
            ASSERT(record->info.shared_itimer);
            LOG(THREAD, LOG_ASYNCH, 2, "sharing itimers with parent\n");
            info->shared_itimer = true;
            info->shared_itimer_refcount = record->info.shared_itimer_refcount;
            info->shared_itimer_underDR = record->info.shared_itimer_underDR;
            info->shared_itimer_lock = record->info.shared_itimer_lock;
            info->itimer = record->info.itimer;
            acquire_recursive_lock(info->shared_itimer_lock);
            (*info->shared_itimer_refcount)++;
            release_recursive_lock(info->shared_itimer_lock);
            /* shared_itimer_underDR will be incremented in start_itimer() */
        } else {
            info->shared_itimer = false;
            init_itimer(dcontext, false/*!first thread*/);
        }

        if (APP_HAS_SIGSTACK(info)) {
            /* parent was under our control, so the real sigstack we see is just
             * the parent's being inherited -- clear it now
             */
            memset(&info->app_sigstack, 0, sizeof(stack_t));
            info->app_sigstack.ss_flags |= SS_DISABLE;
        }

        /* rest of state is never shared.
         * app_sigstack should already be in place, when we set up our sigstack
         * we asked for old sigstack.
         * FIXME: are current pending or blocked inherited?
         */
        res = continuation_pc;
#ifdef MACOS
        if (record->app_thread_xsp != 0) {
            HEAP_TYPE_FREE(GLOBAL_DCONTEXT, record, clone_record_t,
                           ACCT_THREAD_MGT, true/*prot*/);
        }
#endif
    } else {
        /* initialize in isolation */
        if (!dynamo_initialized) {
            /* Undo the early-init handler */
            signal_info_exit_sigaction(GLOBAL_DCONTEXT, &init_info,
                                       false/*!other_thread*/);
            /* Undo the unblock-all */
            sigprocmask_syscall(SIG_SETMASK, &init_sigmask, NULL, sizeof(init_sigmask));
            DOLOG(2, LOG_ASYNCH, {
                LOG(THREAD, LOG_ASYNCH, 2, "initial app signal mask:\n");
                dump_sigset(dcontext, &init_sigmask);
            });
        }

        if (APP_HAS_SIGSTACK(info)) {
            /* parent was NOT under our control, so the real sigstack we see is
             * a real sigstack that was present before we took control
             */
            LOG(THREAD, LOG_ASYNCH, 1, "app already has signal stack "PFX" - "PFX"\n",
                info->app_sigstack.ss_sp,
                info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
        }

        signal_info_init_sigaction(dcontext, info);

        info->shared_itimer = false; /* we'll set to true if a child is created */
        init_itimer(dcontext, true/*first*/);

        if (DYNAMO_OPTION(intercept_all_signals)) {
            /* PR 304708: to support client signal handlers without
             * the complexity of per-thread and per-signal callbacks
             * we always intercept all signals.  We also check here
             * for handlers the app registered before our init.
             */
            for (i=1; i<=MAX_SIGNUM; i++) {
                /* cannot intercept KILL or STOP */
                if (i != SIGKILL && i != SIGSTOP &&
                    /* FIXME PR 297033: we don't support intercepting DEFAULT_STOP /
                     * DEFAULT_CONTINUE signals.  Once add support, update
                     * dr_register_signal_event() comments.
                     */
                    default_action[i] != DEFAULT_STOP &&
                    default_action[i] != DEFAULT_CONTINUE)
                    intercept_signal(dcontext, info, i);
            }
        } else {
            /* we intercept the following signals ourselves: */
            intercept_signal(dcontext, info, SIGSEGV);
            /* PR 313665: look for DR crashes on unaligned memory or mmap bounds */
            intercept_signal(dcontext, info, SIGBUS);
            /* PR 212090: the signal we use to suspend threads */
            intercept_signal(dcontext, info, SUSPEND_SIGNAL);
#ifdef PAPI
            /* use SIGPROF for updating gui so it can be distinguished from SIGVTALRM */
            intercept_signal(dcontext, info, SIGPROF);
#endif
            /* vtalarm only used with pc profiling.  it interferes w/ PAPI
             * so arm this signal only if necessary
             */
            if (INTERNAL_OPTION(profile_pcs)) {
                intercept_signal(dcontext, info, SIGVTALRM);
            }
#ifdef CLIENT_INTERFACE
            intercept_signal(dcontext, info, SIGALRM);
#endif
#ifdef SIDELINE
            intercept_signal(dcontext, info, SIGCHLD);
#endif
            /* i#61/PR 211530: the signal we use for nudges */
            intercept_signal(dcontext, info, NUDGESIG_SIGNUM);

            /* process any handlers app registered before our init */
            for (i=1; i<=MAX_SIGNUM; i++) {
                if (info->we_intercept[i]) {
                    /* intercept_signal already stored pre-existing handler */
                    continue;
                }
                rc = sigaction_syscall(i, NULL, &oldact);
                ASSERT(rc == 0
                       /* Workaround for PR 223720, which was fixed in ESX4.0 but
                        * is present in ESX3.5 and earlier: vmkernel treats
                        * 63 and 64 as invalid signal numbers.
                        */
                       IF_VMX86(|| (i >= 63 && rc == -EINVAL))
                       );
                if (rc == 0 &&
                    oldact.handler != (handler_t) SIG_DFL &&
                    oldact.handler != (handler_t) master_signal_handler) {
                    /* could be master_ if inherited */
                    /* FIXME: if app removes handler, we'll never remove ours */
                    intercept_signal(dcontext, info, i);
                    info->we_intercept[i] = false;
                }
            }
        }

        /* should be 1st thread */
        if (get_num_threads() > 1)
            ASSERT_NOT_REACHED();
        /* FIXME: any way to recover if not 1st thread? */
        res = NULL;
    }

    unblock_all_signals(&info->app_sigblocked);
    DOLOG(2, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 2, "thread's initial app signal mask:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    });

    /* only when SIGVTALRM handler is in place should we start itimer (PR 537743) */
    if (INTERNAL_OPTION(profile_pcs)) {
        /* even if the parent thread exits, we can use a pointer to its
         * pcprofile_info b/c when shared it's process-shared and is not freed
         * until the entire process exits
         */
        pcprofile_thread_init(dcontext, info->shared_itimer,
                              (record == NULL) ? NULL : record->pcprofile_info);
    }

    /* Assumed to be async safe. */
    info->fully_initialized = true;

    return res;
}

/* When taking over existing app threads, we assume they're using pthreads and
 * expect to share signal handlers, memory, thread group id, etc.
 *
 * Builds a synthetic, stack-allocated clone record that names takeover_dc's
 * thread as the "parent" and passes it to signal_thread_inherit() so that
 * dcontext's thread is initialized as though it had been cloned from it.
 */
void
share_siginfo_after_take_over(dcontext_t *dcontext, dcontext_t *takeover_dc)
{
    clone_record_t crec;
    thread_sig_info_t *parent_siginfo =
        (thread_sig_info_t*)takeover_dc->signal_field;
    /* Start from an all-zero record: signal_thread_inherit() reads fields
     * beyond the ones we fill in below (pcprofile_info, and app_thread_xsp
     * on MACOS where a non-zero value makes it heap-free the record), so
     * leaving them as stack garbage is not safe.
     */
    memset(&crec, 0, sizeof(crec));
    /* Create a fake clone record with the given siginfo.  All threads in the
     * same thread group must share signal handlers since Linux 2.5.35, but we
     * have to guess at the other flags.
     * FIXME i#764: If we take over non-pthreads threads, we'll need some way to
     * tell if they're sharing signal handlers or not.
     */
    crec.caller_id = takeover_dc->owning_thread;
#ifdef LINUX
    crec.clone_sysnum = SYS_clone;
#else
    ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#58: NYI on Mac */
#endif
    crec.clone_flags = PTHREAD_CLONE_FLAGS;
    crec.parent_info = parent_siginfo;
    crec.info = *parent_siginfo;
    /* signal_thread_inherit() forwards this to pcprofile_thread_init() when
     * -profile_pcs is on, so it must refer to the parent's profiling state.
     */
    crec.pcprofile_info = takeover_dc->pcprofile_field;
    signal_thread_inherit(dcontext, &crec);
}

/* Resets this thread's signal state in the child of a fork: drops all
 * sharing of handler tables and itimers, and empties the pending queues.
 *
 * This is split from os_fork_init() so the new logfiles are available
 * (xref i#189/PR 452168).  It had to be after dynamo_other_thread_exit()
 * called in dynamorio_fork_init() after os_fork_init() else we clean
 * up data structs used in signal_thread_exit().
 */
void
signal_fork_init(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int sig;

    /* Child of fork is a single thread in a new process so should
     * start over w/ no sharing (xref i#190/PR 452178)
     */
    if (info->shared_app_sigaction) {
        info->shared_app_sigaction = false;
        if (info->shared_lock != NULL) {
            DELETE_LOCK(*info->shared_lock);
            global_heap_free(info->shared_lock, sizeof(mutex_t) HEAPACCT(ACCT_OTHER));
            info->shared_lock = NULL;
        }
        if (info->shared_refcount != NULL) {
            global_heap_free(info->shared_refcount, sizeof(int) HEAPACCT(ACCT_OTHER));
            info->shared_refcount = NULL;
        }
    }

    if (info->shared_itimer) {
        /* itimers are not inherited across fork */
        info->shared_itimer = false;
        if (os_itimers_thread_shared()) {
            global_heap_free(info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        } else {
            heap_free(dcontext, info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        }
        info->itimer = NULL;  /* reset by init_itimer */

        /* release the bookkeeping that accompanied the shared itimer */
        ASSERT(info->shared_itimer_lock != NULL);
        DELETE_RECURSIVE_LOCK(*info->shared_itimer_lock);
        global_heap_free(info->shared_itimer_lock, sizeof(*info->shared_itimer_lock)
                         HEAPACCT(ACCT_OTHER));
        info->shared_itimer_lock = NULL;

        ASSERT(info->shared_itimer_refcount != NULL);
        global_heap_free(info->shared_itimer_refcount, sizeof(int) HEAPACCT(ACCT_OTHER));
        info->shared_itimer_refcount = NULL;

        ASSERT(info->shared_itimer_underDR != NULL);
        global_heap_free(info->shared_itimer_underDR, sizeof(int) HEAPACCT(ACCT_OTHER));
        info->shared_itimer_underDR = NULL;

        /* start over with private itimer state */
        init_itimer(dcontext, true/*first*/);
    }

    info->num_unstarted_children = 0;

    /* "A child created via fork(2) initially has an empty pending signal set" */
    dcontext->signals_pending = false;
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        sigpending_t *head;
        while ((head = info->sigpending[sig]) != NULL) {
            info->sigpending[sig] = head->next;
            special_heap_free(info->sigheap, head);
        }
    }

    if (INTERNAL_OPTION(profile_pcs)) {
        pcprofile_fork_init(dcontext);
    }

    /* Assumed to be async safe. */
    info->fully_initialized = true;
}

#ifdef DEBUG
/* Debug-only check: queries the current SIGSEGV disposition and reports
 * whether it is master_signal_handler.  Returns false if the query fails.
 */
static bool
sigsegv_handler_is_ours(void)
{
    kernel_sigaction_t current;
    if (sigaction_syscall(SIGSEGV, NULL, &current) != 0)
        return false;
    return current.handler == (handler_t)master_signal_handler;
}
#endif /* DEBUG */

/* Tears down the signal state stored in dcontext->signal_field.
 *
 * Waits for any not-yet-started children that still need to copy this
 * thread's state, drops this thread's references to the (possibly shared)
 * app handler table and itimer state, frees them if this was the last
 * reference, empties the pending-signal queues, restores the app's
 * alternate signal stack, and releases the remaining per-thread resources.
 *
 * other_thread is passed through unchanged to signal_info_exit_sigaction().
 */
void
signal_thread_exit(dcontext_t *dcontext, bool other_thread)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int i;
    /* Whether we held the last reference to the handler table / itimer state.
     * Non-shared state is always ours to free.  For shared state this is
     * decided while holding the corresponding lock: re-reading the refcount
     * after unlocking could let two concurrently-exiting threads both see 0
     * (double free) or read a count that another thread has already freed.
     */
    bool last_sigaction_user = true;
    bool last_itimer_user = true;

    /* i#1012: DR's signal handler should always be installed before this point.
     */
    ASSERT(sigsegv_handler_is_ours() || removed_sig_handler);

    while (info->num_unstarted_children > 0) {
        /* must wait for children to start and copy our state
         * before we destroy it!
         */
        os_thread_yield();
    }

    if (dynamo_exited) {
        /* stop itimers before removing signal handlers */
        for (i = 0; i < NUM_ITIMERS; i++)
            set_actual_itimer(dcontext, i, info, false/*disable*/);
    }

    /* FIXME: w/ shared handlers, if parent (the owner here) dies,
     * can children keep living w/ a copy of the handlers?
     */
    if (info->shared_app_sigaction) {
        mutex_lock(info->shared_lock);
        (*info->shared_refcount)--;
        last_sigaction_user = (*info->shared_refcount == 0);
        mutex_unlock(info->shared_lock);
    }
    if (last_sigaction_user) {
        LOG(THREAD, LOG_ASYNCH, 2, "signal handler cleanup:\n");
        signal_info_exit_sigaction(dcontext, info, other_thread);
        if (info->shared_lock != NULL) {
            DELETE_LOCK(*info->shared_lock);
            global_heap_free(info->shared_lock, sizeof(mutex_t) HEAPACCT(ACCT_OTHER));
        }
        if (info->shared_refcount != NULL)
            global_heap_free(info->shared_refcount, sizeof(int) HEAPACCT(ACCT_OTHER));
    }

    if (info->shared_itimer) {
        acquire_recursive_lock(info->shared_itimer_lock);
        (*info->shared_itimer_refcount)--;
        last_itimer_user = (*info->shared_itimer_refcount == 0);
        release_recursive_lock(info->shared_itimer_lock);
    }
    if (last_itimer_user) {
        if (INTERNAL_OPTION(profile_pcs)) {
            /* no cleanup needed for non-final thread in group */
            pcprofile_thread_exit(dcontext);
        }
        /* must mirror how the itimer array was allocated */
        if (os_itimers_thread_shared())
            global_heap_free(info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        else
            heap_free(dcontext, info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        if (info->shared_itimer_lock != NULL) {
            DELETE_RECURSIVE_LOCK(*info->shared_itimer_lock);
            global_heap_free(info->shared_itimer_lock, sizeof(recursive_lock_t)
                             HEAPACCT(ACCT_OTHER));
            ASSERT(info->shared_itimer_refcount != NULL);
            global_heap_free(info->shared_itimer_refcount, sizeof(int)
                             HEAPACCT(ACCT_OTHER));
            ASSERT(info->shared_itimer_underDR != NULL);
            global_heap_free(info->shared_itimer_underDR, sizeof(int)
                             HEAPACCT(ACCT_OTHER));
        }
    }
    for (i = 1; i <= MAX_SIGNUM; i++) {
        /* pending queue is per-thread and not shared */
        while (info->sigpending[i] != NULL) {
            sigpending_t *temp = info->sigpending[i];
            info->sigpending[i] = temp->next;
            special_heap_free(info->sigheap, temp);
        }
    }
#ifdef HAVE_SIGALTSTACK
    /* Remove our sigstack and restore the app sigstack if it had one.  */
    LOG(THREAD, LOG_ASYNCH, 2, "removing our signal stack "PFX" - "PFX"\n",
        info->sigstack.ss_sp, info->sigstack.ss_sp + info->sigstack.ss_size);
    if (APP_HAS_SIGSTACK(info)) {
        LOG(THREAD, LOG_ASYNCH, 2, "restoring app signal stack "PFX" - "PFX"\n",
            info->app_sigstack.ss_sp,
            info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
    } else {
        ASSERT(TEST(SS_DISABLE, info->app_sigstack.ss_flags));
    }
    if (info->sigstack.ss_sp != NULL) {
        /* i#552: to raise client exit event, we may call dynamo_process_exit
         * on sigstack in signal handler.
         * In that case we set sigstack (ss_sp) NULL to avoid stack swap.
         */
        i = sigaltstack_syscall(&info->app_sigstack, NULL);
        ASSERT(i == 0);
    }
#endif
    IF_LINUX(signalfd_thread_exit(dcontext, info));
    special_heap_exit(info->sigheap);
    DELETE_LOCK(info->child_lock);
#ifdef DEBUG
    /* for non-debug we do fast exit path and don't free local heap */
# ifdef HAVE_SIGALTSTACK
    if (info->sigstack.ss_sp != NULL) {
        /* i#552: to raise client exit event, we may call dynamo_process_exit
         * on sigstack in signal handler.
         * In that case we set sigstack (ss_sp) NULL to avoid stack free.
         */
        stack_free(info->sigstack.ss_sp + info->sigstack.ss_size,
                   info->sigstack.ss_size);
    }
# endif
    /* info must not be touched after this point */
    HEAP_TYPE_FREE(dcontext, info, thread_sig_info_t, ACCT_OTHER, PROTECTED);
#endif
#ifdef PAPI
    /* NOTE(review): this registers an itimer callback on the exit path and
     * the cast below does not look like valid C -- confirm whether this
     * PAPI-only code is still built.
     */
    /* use SIGPROF for updating gui so it can be distinguished from SIGVTALRM */
    set_itimer_callback(dcontext, ITIMER_PROF, 500,
                        (void (*func)(dcontext_t *, priv_mcontext_t *))
                        perfctr_update_gui());
#endif
}

/* Fills in *act so that it installs master_signal_handler for signal "sig",
 * with the flags and handler-time signal mask that our handler requires.
 */
static void
set_our_handler_sigact(kernel_sigaction_t *act, int sig)
{
    /* We block most signals within our handler */
    kernel_sigfillset(&act->mask);
    /* i#184/PR 450670: we let our suspend signal interrupt our own handler
     * We never send more than one before resuming, so no danger to stack usage
     * from our own: but app could pile them up.
     */
    kernel_sigdelset(&act->mask, SUSPEND_SIGNAL);
    /* i#193/PR 287309: we need to NOT suppress further SIGSEGV, for decode faults,
     * for try/except, and for !HAVE_MEMINFO probes.
     * Just like SUSPEND_SIGNAL, if app sends repeated SEGV, could run out of
     * alt stack: seems too corner-case to be worth increasing stack size.
     */
    kernel_sigdelset(&act->mask, SIGSEGV);

    act->handler = (handler_t) master_signal_handler;
#ifdef MACOS
    /* This is the real target */
    act->tramp = (tramp_t) master_signal_handler;
#endif

    /* SA_SIGINFO: send 3 args to handler; SA_ONSTACK: use our sigstack */
#ifdef HAVE_SIGALTSTACK
    act->flags = SA_SIGINFO | SA_ONSTACK;
#else
    act->flags = SA_SIGINFO;
#endif

#if defined(X64) && !defined(VMX86_SERVER) && defined(LINUX)
    /* PR 305020: must have SA_RESTORER for x64 */
    act->flags |= SA_RESTORER;
    act->restorer = (void (*)(void)) dynamorio_sigreturn;
#endif

    /* the two signals left unmasked above must also be allowed to nest */
    if (sig == SUSPEND_SIGNAL || sig == SIGSEGV)
        act->flags |= SA_NODEFER;

    /* Sigset is a 1 or 2 elt array of longs on X64/X86.  Treat as 2 elt of
     * uint32. */
    IF_DEBUG(uint32 *mask_words = (uint32*)&act->mask.sig[0]);
    LOG(THREAD_GET, LOG_ASYNCH, 3,
        "mask for our handler is "PFX" "PFX"\n", mask_words[0], mask_words[1]);
}

/* Installs master_signal_handler as the handler for signal "sig" for the
 * current thread, remembering any handler the app had already registered
 * in info->app_sigaction[sig].  We work with kernel data structures
 * directly, as our system call interception does, so that no translation
 * to/from libc data structures is needed.
 */
static void
intercept_signal(dcontext_t *dcontext, thread_sig_info_t *info, int sig)
{
    kernel_sigaction_t ours;
    kernel_sigaction_t prev;
    bool app_had_handler;
    int res;
    ASSERT(sig <= MAX_SIGNUM);

    set_our_handler_sigact(&ours, sig);
    /* arm the signal */
    res = sigaction_syscall(sig, &ours, &prev);
    ASSERT(res == 0
           /* Workaround for PR 223720, which was fixed in ESX4.0 but
            * is present in ESX3.5 and earlier: vmkernel treats
            * 63 and 64 as invalid signal numbers.
            */
           IF_VMX86(|| (sig >= 63 && res == -EINVAL))
           );
    if (res != 0) /* be defensive: app will probably still work */
        return;

    /* a previous handler of master_signal_handler means it was ours already */
    app_had_handler = (prev.handler != (handler_t) SIG_DFL &&
                       prev.handler != (handler_t) master_signal_handler);
    if (app_had_handler) {
        /* save the app's action for sig */
        if (info->shared_app_sigaction) {
            /* app_sigaction structure is shared */
            mutex_lock(info->shared_lock);
        }
        if (info->app_sigaction[sig] != NULL) {
            /* go ahead and toss the old one, it's up to the app to store
             * and then restore later if it wants to
             */
            handler_free(dcontext, info->app_sigaction[sig], sizeof(kernel_sigaction_t));
        }
        info->app_sigaction[sig] = (kernel_sigaction_t *)
            handler_alloc(dcontext, sizeof(kernel_sigaction_t));
        memcpy(info->app_sigaction[sig], &prev, sizeof(kernel_sigaction_t));
        /* clear cache */
        info->restorer_valid[sig] = -1;
        if (info->shared_app_sigaction)
            mutex_unlock(info->shared_lock);
#ifdef DEBUG
        if (prev.handler != (handler_t) SIG_IGN) {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app already installed "PFX" as sigaction for signal %d\n",
                prev.handler, sig);
        } else {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app already installed SIG_IGN as sigaction for signal %d\n", sig);
        }
#endif
    }

    LOG(THREAD, LOG_ASYNCH, 3, "\twe intercept signal %d\n", sig);
    info->we_intercept[sig] = true;
}

/**** system call handlers ***********************************************/

/* FIXME: an invalid pointer passed to the kernel will currently most likely
 * show up as a segfault in our handlers below.  We need to make them behave
 * like the kernel does, and pass the error code back to os.c.
 */

/* Pre-clone bookkeeping for the calling (parent) thread, given the clone
 * flags the app passed.  Depending on the flags this sets up sharing of the
 * app handler table and/or of the itimer state with the child, or records
 * that a child is about to copy our private handler table.
 */
void
handle_clone(dcontext_t *dcontext, uint flags)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    const bool share_handlers = TEST(CLONE_SIGHAND, flags);

    if (!TEST(CLONE_VM, flags)) {
        /* separate process not sharing memory */
        if (share_handlers) {
            /* FIXME: how deal with this?
             * "man clone" says: "Since Linux 2.6.0-test6, flags must also
             * include CLONE_VM if CLONE_SIGHAND is specified"
             */
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: !CLONE_VM but CLONE_SIGHAND!\n");
            ASSERT_NOT_IMPLEMENTED(false);
        }
        return;
    }

    pre_second_thread();

    if (!share_handlers) {
        /* child will inherit copy of current table -> cannot modify it
         * until child is scheduled!  FIXME: any other way?
         */
        mutex_lock(&info->child_lock);
        info->num_unstarted_children++;
        mutex_unlock(&info->child_lock);
    } else {
        /* need to share table of handlers! */
        LOG(THREAD, LOG_ASYNCH, 2, "handle_clone: CLONE_SIGHAND set!\n");
        if (!info->shared_app_sigaction) {
            /* this is the start of a chain of sharing
             * no synch needed here, child not created yet
             */
            info->shared_app_sigaction = true;
            info->shared_refcount = (int *) global_heap_alloc(sizeof(int)
                                                              HEAPACCT(ACCT_OTHER));
            *info->shared_refcount = 1;
            info->shared_lock = (mutex_t *) global_heap_alloc(sizeof(mutex_t)
                                                            HEAPACCT(ACCT_OTHER));
            ASSIGN_INIT_LOCK_FREE(*info->shared_lock, shared_lock);
        } /* else, some ancestor is already owner */
    }

    /* else-cases below: itimers stay private, or some ancestor already
     * created the shared state
     */
    if (TEST(CLONE_THREAD, flags) && os_itimers_thread_shared() &&
        !info->shared_itimer) {
        /* this is the start of a chain of sharing
         * no synch needed here, child not created yet
         */
        info->shared_itimer = true;
        info->shared_itimer_refcount = (int *)
            global_heap_alloc(sizeof(int) HEAPACCT(ACCT_OTHER));
        *info->shared_itimer_refcount = 1;
        info->shared_itimer_underDR = (int *)
            global_heap_alloc(sizeof(int) HEAPACCT(ACCT_OTHER));
        *info->shared_itimer_underDR = 1;
        info->shared_itimer_lock = (recursive_lock_t *)
            global_heap_alloc(sizeof(*info->shared_itimer_lock) HEAPACCT(ACCT_OTHER));
        ASSIGN_INIT_RECURSIVE_LOCK_FREE(*info->shared_itimer_lock, shared_itimer_lock);
    }
}

/* Pre-syscall handling of the app's sigaction call.
 *
 * When the app installs a new action (act != NULL) for a valid signal, the
 * app's sigaction is saved in info->app_sigaction[sig].  If we already
 * intercept that signal the syscall is cancelled; otherwise *act is
 * rewritten in place so that the kernel installs our master handler
 * (except for SIG_IGN/SIG_DFL on a signal we do not intercept, which is
 * passed through untouched).
 *
 * oact is not touched here: it is handled post-syscall.
 * sigsetsize is unused here.
 *
 * Returns false if should NOT issue syscall.
 */
bool
handle_sigaction(dcontext_t *dcontext, int sig, const kernel_sigaction_t *act,
                 kernel_sigaction_t *oact, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    kernel_sigaction_t *save;
    kernel_sigaction_t *non_const_act = (kernel_sigaction_t *) act;
    /* i#1035: app may pass invalid signum to find MAX_SIGNUM.
     * sig is used below as an index into per-signal arrays, so both bounds
     * must be checked: zero or negative values are simply passed on to the
     * kernel to reject.
     */
    if (sig > 0 && sig <= MAX_SIGNUM && act != NULL) {
        /* app is installing a new action */

        while (info->num_unstarted_children > 0) {
            /* must wait for children to start and copy our state
             * before we modify it!
             */
            os_thread_yield();
        }

        if (info->shared_app_sigaction) {
            /* app_sigaction structure is shared */
            mutex_lock(info->shared_lock);
        }

        if (act->handler == (handler_t) SIG_IGN ||
            act->handler == (handler_t) SIG_DFL) {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app installed %s as sigaction for signal %d\n",
                (act->handler == (handler_t) SIG_IGN) ? "SIG_IGN" : "SIG_DFL", sig);
            if (!info->we_intercept[sig]) {
                /* let the SIG_IGN/SIG_DFL go through, we want to remove our
                 * handler.  we delete the stored app_sigaction in post_
                 */
                if (info->shared_app_sigaction)
                    mutex_unlock(info->shared_lock);
                return true;
            }
        } else {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app installed "PFX" as sigaction for signal %d\n",
                act->handler, sig);
            DOLOG(2, LOG_ASYNCH, {
                LOG(THREAD, LOG_ASYNCH, 2, "signal mask for handler:\n");
                dump_sigset(dcontext, (kernel_sigset_t *) &act->mask);
            });
        }

        /* save app's entire sigaction struct */
        save = (kernel_sigaction_t *) handler_alloc(dcontext, sizeof(kernel_sigaction_t));
        memcpy(save, act, sizeof(kernel_sigaction_t));
        if (info->app_sigaction[sig] != NULL) {
            /* go ahead and toss the old one, it's up to the app to store
             * and then restore later if it wants to
             */
            handler_free(dcontext, info->app_sigaction[sig], sizeof(kernel_sigaction_t));
        }
        info->app_sigaction[sig] = save;
        LOG(THREAD, LOG_ASYNCH, 3, "\tflags = "PFX", %s = "PFX"\n",
            act->flags, IF_MACOS_ELSE("tramp","restorer"),
            IF_MACOS_ELSE(act->tramp, act->restorer));
        /* clear cache */
        info->restorer_valid[sig] = -1;
        if (info->shared_app_sigaction)
            mutex_unlock(info->shared_lock);

        if (info->we_intercept[sig]) {
            /* cancel the syscall */
            return false;
        }
        /* now hand kernel our master handler instead of app's
         * FIXME: double-check we're dealing w/ all possible mask, flag
         * differences between app & our handler
         */
        set_our_handler_sigact(non_const_act, sig);

        /* FIXME PR 297033: we don't support intercepting DEFAULT_STOP /
         * DEFAULT_CONTINUE signals b/c we can't generate the default
         * action: if the app registers a handler, though, we should work
         * properly if we never see SIG_DFL.
         */
    }

    /* oact is handled post-syscall */

    return true;
}

/* Post-syscall handling of the app's sigaction call: fills in *oact with the
 * action the app believes was previously installed, rather than ours.
 *
 * os.c thinks it's passing us struct_sigaction, really it's kernel_sigaction_t,
 * which has fields in different order.
 */
void
handle_post_sigaction(dcontext_t *dcontext, int sig, const kernel_sigaction_t *act,
                      kernel_sigaction_t *oact, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    kernel_sigaction_t *stored;
    /* this is only called on success, so sig must be in the valid range */
    ASSERT(sig <= MAX_SIGNUM && sig > 0);
    if (oact == NULL)
        return; /* app did not ask for the old action */

    /* FIXME: hold lock across the syscall?!?
     * else could be modified and get wrong old action?
     */
    /* FIXME: make sure oact is readable & writable before accessing! */
    if (info->shared_app_sigaction)
        mutex_lock(info->shared_lock);

    stored = info->app_sigaction[sig];
    if (stored != NULL) {
        memcpy(oact, stored, sizeof(kernel_sigaction_t));

        /* if installing IGN or DFL, delete ours */
        if (act != NULL && !info->we_intercept[sig] &&
            (act->handler == (handler_t) SIG_IGN ||
             act->handler == (handler_t) SIG_DFL)) {
            /* remove old stored app action */
            handler_free(dcontext, info->app_sigaction[sig],
                         sizeof(kernel_sigaction_t));
            info->app_sigaction[sig] = NULL;
        }
    } else if (info->we_intercept[sig]) {
        /* need to pretend there is no handler */
        memset(oact, 0, sizeof(*oact));
        oact->handler = (handler_t) SIG_DFL;
    } else {
        ASSERT(oact->handler == (handler_t) SIG_IGN ||
               oact->handler == (handler_t) SIG_DFL);
    }

    if (info->shared_app_sigaction)
        mutex_unlock(info->shared_lock);
}

/* Emulates the app's sigaltstack call against our record of the app's
 * alternate stack (info->app_sigstack): the old value is reported from it
 * and a new value is stored into it.
 * Returns false if should NOT issue syscall, which is the case whenever the
 * app supplies a new stack.
 */
bool
handle_sigaltstack(dcontext_t *dcontext, const stack_t *stack,
                   stack_t *old_stack)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;

    /* report the previous app stack before it is possibly overwritten */
    if (old_stack != NULL)
        *old_stack = info->app_sigstack;

    if (stack == NULL)
        return true; /* pure query */

    info->app_sigstack = *stack;
    LOG(THREAD, LOG_ASYNCH, 2, "app set up signal stack "PFX" - "PFX" %s\n",
        stack->ss_sp, stack->ss_sp + stack->ss_size - 1,
        (APP_HAS_SIGSTACK(info)) ? "enabled" : "disabled");
    return false; /* always cancel syscall */
}

/* Blocked signals:
 * In general, we don't need to keep track of blocked signals.
 * We only need to do so for those signals we intercept ourselves.
 * Thus, info->app_sigblocked ONLY contains entries for signals
 * we intercept ourselves.
 * PR 304708: we now intercept all signals.
 */

/* Updates info->app_sigblocked from "set", considering only signals whose
 * mask we emulate.  If "absolute", the current blocked set is discarded
 * first; otherwise the new signals are OR-ed into it.
 */
static void
set_blocked(dcontext_t *dcontext, kernel_sigset_t *set, bool absolute)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int sig;

    if (absolute) {
        /* discard current blocked signals, re-set from new mask */
        kernel_sigemptyset(&info->app_sigblocked);
    }

    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!EMULATE_SIGMASK(info, sig))
            continue;
        if (!kernel_sigismember(set, sig))
            continue;
        kernel_sigaddset(&info->app_sigblocked, sig);
    }

#ifdef DEBUG
    if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "blocked signals are now:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    }
#endif
}

/* Sets dcontext->signals_pending if info->sigpending holds at least one
 * signal that the application does not currently have blocked.  Call this
 * after modifying the set of signals blocked by the application.
 */
static void
check_signals_pending(dcontext_t *dcontext, thread_sig_info_t *info)
{
    int sig;

    /* already flagged: nothing more to find out */
    if (dcontext->signals_pending)
        return;

    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sigpending[sig] == NULL)
            continue;
        if (kernel_sigismember(&info->app_sigblocked, sig))
            continue;
        /* We only update the application's set of blocked signals from
         * syscall handlers, so we know we'll go back to dispatch and see
         * this flag right away.
         */
        dcontext->signals_pending = true;
        return;
    }
}

/* Pre-syscall handling of the app's sigprocmask call: applies the requested
 * change to our emulated blocked set (info->app_sigblocked) for every signal
 * whose mask we emulate.  With -intercept_all_signals the whole call is
 * emulated, including the post-syscall step, and the syscall is skipped.
 * Returns whether to execute the syscall.
 */
bool
handle_sigprocmask(dcontext_t *dcontext, int how, kernel_sigset_t *app_set,
                   kernel_sigset_t *oset, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int sig;
    kernel_sigset_t safe_set;
    /* If we're intercepting all, we emulate the whole thing */
    bool execute_syscall = !DYNAMO_OPTION(intercept_all_signals);
    LOG(THREAD, LOG_ASYNCH, 2, "handle_sigprocmask\n");
    /* snapshot the app's view of the mask before this call changes it */
    if (oset != NULL)
        info->pre_syscall_app_sigblocked = info->app_sigblocked;
    if (app_set != NULL && safe_read(app_set, sizeof(safe_set), &safe_set)) {
        if (execute_syscall) {
            /* The syscall will execute, so remove from the set passed
             * to it.   We restore post-syscall.
             * XXX i#1187: we could crash here touching app memory -- could
             * use TRY, but the app could pass read-only memory and it
             * would work natively!  Better to swap in our own
             * allocated data struct.  There's a transparency issue w/
             * races too if another thread looks at this memory.  This
             * won't happen by default b/c -intercept_all_signals is
             * on by default so we don't try to solve all these
             * issues.
             */
            info->pre_syscall_app_sigprocmask = safe_set;
        }
        if (how == SIG_BLOCK || how == SIG_UNBLOCK || how == SIG_SETMASK) {
            /* SIG_BLOCK:   blocked set becomes the union of the current set
             *              and the set argument.
             * SIG_UNBLOCK: the signals in set are removed from the current
             *              set of blocked signals.
             * SIG_SETMASK: blocked set is set to the argument set.
             */
            if (how == SIG_SETMASK)
                kernel_sigemptyset(&info->app_sigblocked);
            for (sig = 1; sig <= MAX_SIGNUM; sig++) {
                if (!EMULATE_SIGMASK(info, sig) ||
                    !kernel_sigismember(&safe_set, sig))
                    continue;
                if (how == SIG_UNBLOCK)
                    kernel_sigdelset(&info->app_sigblocked, sig);
                else
                    kernel_sigaddset(&info->app_sigblocked, sig);
                /* hide emulated signals from the real syscall */
                if (execute_syscall)
                    kernel_sigdelset(app_set, sig);
            }
        }
#ifdef DEBUG
        if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
            LOG(THREAD, LOG_ASYNCH, 3, "blocked signals are now:\n");
            dump_sigset(dcontext, &info->app_sigblocked);
        }
#endif
        /* make sure we deliver pending signals that are now unblocked
         * FIXME: consider signal #S, which we intercept ourselves.
         * If S arrives, then app blocks it prior to our delivering it,
         * we then won't deliver it until app unblocks it...is this a
         * problem?  Could have arrived a little later and then we would
         * do same thing, but this way kernel may send one more than would
         * get w/o dynamo?  This goes away if we deliver signals
         * prior to letting app do a syscall.
         */
        check_signals_pending(dcontext, info);
    }
    if (execute_syscall)
        return true;
    /* fully emulated: run the post-syscall step ourselves */
    handle_post_sigprocmask(dcontext, how, app_set, oset, sigsetsize);
    return false; /* skip syscall */
}

/* Post-processing for the app's sigprocmask request; also invoked directly
 * from the pre-syscall path when the syscall itself is skipped.
 * When not intercepting all signals, the app's input mask memory is put
 * back to its pre-syscall contents.  If the app asked for the old mask
 * (oset != NULL), the signals we emulate that the app thinks are blocked
 * are reflected in it.
 */
void
handle_post_sigprocmask(dcontext_t *dcontext, int how, kernel_sigset_t *app_set,
                        kernel_sigset_t *oset, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int sig;

    if (!DYNAMO_OPTION(intercept_all_signals)) {
        /* Restore app memory */
        safe_write_ex(app_set, sizeof(*app_set), &info->pre_syscall_app_sigprocmask,
                      NULL);
    }

    /* nothing more to do if the app did not request the previous mask */
    if (oset == NULL)
        return;

    if (DYNAMO_OPTION(intercept_all_signals)) {
        /* the whole old mask comes from our saved copy */
        safe_write_ex(oset, sizeof(*oset), &info->pre_syscall_app_sigblocked, NULL);
        return;
    }

    /* the syscall wrote to oset already, so just add any additional */
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!EMULATE_SIGMASK(info, sig))
            continue;
        /* use the pre-syscall value: do not take into account changes
         * from this syscall itself! (PR 523394)
         */
        if (kernel_sigismember(&info->pre_syscall_app_sigblocked, sig))
            kernel_sigaddset(oset, sig);
    }
}

/* Pre-processing for the app's sigsuspend(set).
 * Saves the current emulated blocked set, marks the thread as being in
 * sigsuspend, and rebuilds the emulated blocked set from "set".  Every
 * emulated signal found in "set" is moved into our own blocked set and
 * removed from "set" itself.
 */
void
handle_sigsuspend(dcontext_t *dcontext, kernel_sigset_t *set,
                  size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    int sig;
    ASSERT(set != NULL);
    LOG(THREAD, LOG_ASYNCH, 2, "handle_sigsuspend\n");
    info->in_sigsuspend = true;
    /* remember the mask in effect before the suspend */
    info->app_sigblocked_save = info->app_sigblocked;
    kernel_sigemptyset(&info->app_sigblocked);
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!EMULATE_SIGMASK(info, sig) || !kernel_sigismember(set, sig))
            continue;
        kernel_sigaddset(&info->app_sigblocked, sig);
        kernel_sigdelset(set, sig);
    }
#ifdef DEBUG
    if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "in sigsuspend, blocked signals are now:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    }
#endif
}

/**** utility routines ***********************************************/
#ifdef DEBUG
/* Logs one line for every signal number that is a member of set. */
static void
dump_sigset(dcontext_t *dcontext, kernel_sigset_t *set)
{
    int signum;
    for (signum = 1; signum <= MAX_SIGNUM; signum++) {
        if (!kernel_sigismember(set, signum))
            continue;
        LOG(THREAD, LOG_ASYNCH, 1, "\t%d = blocked\n", signum);
    }
}
#endif /* DEBUG */

/* PR 205795: in_fcache() grabs a lock, and we could have interrupted
 * someone holding that lock.  To avoid lock problems we first check
 * whereami: if it is WHERE_FCACHE we still check the pc to distinguish
 * generated routines, but at least we're certain it's not in DR where it
 * could own a lock.
 * We can't use is_on_dstack() here b/c we need to handle clean call
 * arg crashes -- which is too bad since checking client dll and DR dll is
 * not sufficient due to calls to ntdll, libc, or pc being in gencode.
 */
static bool
safe_is_in_fcache(dcontext_t *dcontext, app_pc pc, app_pc xsp)
{
    /* The filters below are evaluated in the same order as a single
     * short-circuit || chain would evaluate them.
     */
    if (dcontext->whereami != WHERE_FCACHE)
        return false;
    if (IF_CLIENT_INTERFACE(is_in_client_lib(pc) ||) is_in_dynamo_dll(pc))
        return false;
    if (is_on_initstack(xsp))
        return false;
    /* Reasonably certain not in DR code, so no locks should be held */
    return in_fcache(pc);
}

/* Same filtering as safe_is_in_fcache(), but the final query is
 * in_coarse_stubs(): returns false unless whereami is WHERE_FCACHE, pc is
 * outside the client library and the DR library, and xsp is not on the
 * initstack.
 */
static bool
safe_is_in_coarse_stubs(dcontext_t *dcontext, app_pc pc, app_pc xsp)
{
    if (dcontext->whereami != WHERE_FCACHE)
        return false;
    if (IF_CLIENT_INTERFACE(is_in_client_lib(pc) ||) is_in_dynamo_dll(pc))
        return false;
    if (is_on_initstack(xsp))
        return false;
    /* Reasonably certain not in DR code, so no locks should be held */
    return in_coarse_stubs(pc);
}

/* Returns whether sp lies within the signal stack recorded in this
 * thread's signal info, with both ends of the range included.
 * Always returns false when built without HAVE_SIGALTSTACK.
 */
static bool
is_on_alt_stack(dcontext_t *dcontext, byte *sp)
{
#ifdef HAVE_SIGALTSTACK
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    byte *stack_lo = (byte *) info->sigstack.ss_sp;
    byte *stack_hi = (byte *) (info->sigstack.ss_sp + info->sigstack.ss_size);
    if (sp < stack_lo)
        return false;
    /* deliberate equality check since stacks often init to top */
    return (sp <= stack_hi);
#else
    return false;
#endif
}

/* Copies the general-purpose registers, flags and pc from the signal
 * context sc into the private machine context mc, then calls
 * sigcontext_to_mcontext_simd() for the rest of the state.
 * Neither pointer may be NULL.
 */
void
sigcontext_to_mcontext(priv_mcontext_t *mc, sigcontext_t *sc)
{
    ASSERT(mc != NULL && sc != NULL);
#ifdef X86
    /* control state */
    mc->pc = (app_pc) sc->SC_XIP;
    mc->xflags = sc->SC_XFLAGS;
    /* stack and frame pointers */
    mc->xsp = sc->SC_XSP;
    mc->xbp = sc->SC_XBP;
    /* index registers */
    mc->xdi = sc->SC_XDI;
    mc->xsi = sc->SC_XSI;
    /* remaining legacy general-purpose registers */
    mc->xdx = sc->SC_XDX;
    mc->xcx = sc->SC_XCX;
    mc->xbx = sc->SC_XBX;
    mc->xax = sc->SC_XAX;
# ifdef X64
    /* 64-bit-only registers */
    mc->r15 = sc->SC_FIELD(r15);
    mc->r14 = sc->SC_FIELD(r14);
    mc->r13 = sc->SC_FIELD(r13);
    mc->r12 = sc->SC_FIELD(r12);
    mc->r11 = sc->SC_FIELD(r11);
    mc->r10 = sc->SC_FIELD(r10);
    mc->r9  = sc->SC_FIELD(r9);
    mc->r8  = sc->SC_FIELD(r8);
# endif /* X64 */
#elif defined (ARM)
    /* the sigcontext names r11-r15 by role: fp, ip, sp, lr, pc */
    mc->r15 = sc->SC_FIELD(arm_pc);
    mc->r14 = sc->SC_FIELD(arm_lr);
    mc->r13 = sc->SC_FIELD(arm_sp);
    mc->r12 = sc->SC_FIELD(arm_ip);
    mc->r11 = sc->SC_FIELD(arm_fp);
    /* numbered registers */
    mc->r10 = sc->SC_FIELD(arm_r10);
    mc->r9  = sc->SC_FIELD(arm_r9);
    mc->r8  = sc->SC_FIELD(arm_r8);
    mc->r7  = sc->SC_FIELD(arm_r7);
    mc->r6  = sc->SC_FIELD(arm_r6);
    mc->r5  = sc->SC_FIELD(arm_r5);
    mc->r4  = sc->SC_FIELD(arm_r4);
    mc->r3  = sc->SC_FIELD(arm_r3);
    mc->r2  = sc->SC_FIELD(arm_r2);
    mc->r1  = sc->SC_FIELD(arm_r1);
    mc->r0  = sc->SC_FIELD(arm_r0);
# ifdef X64
#  error NYI on AArch64
# endif /* X64 */
#endif /* X86/ARM */
    sigcontext_to_mcontext_simd(mc, sc);
}

/* Copies the general-purpose registers, flags and pc from the private
 * machine context mc into the signal context sc, then calls
 * mcontext_to_sigcontext_simd() for the rest of the state.
 * Neither pointer may be NULL.
 *
 * Note that unlike mcontext_to_context(), this routine does not fill in
 * any state that is not present in the mcontext: in particular, it assumes
 * the sigcontext already contains the native fpstate.  If the caller
 * is generating a synthetic sigcontext, the caller should call
 * save_fpstate() before calling this routine.
 */
void
mcontext_to_sigcontext(sigcontext_t *sc, priv_mcontext_t *mc)
{
    /* same precondition check as the inverse routine sigcontext_to_mcontext() */
    ASSERT(sc != NULL && mc != NULL);
#ifdef X86
    sc->SC_XAX = mc->xax;
    sc->SC_XBX = mc->xbx;
    sc->SC_XCX = mc->xcx;
    sc->SC_XDX = mc->xdx;
    sc->SC_XSI = mc->xsi;
    sc->SC_XDI = mc->xdi;
    sc->SC_XBP = mc->xbp;
    sc->SC_XSP = mc->xsp;
    sc->SC_XFLAGS = mc->xflags;
    sc->SC_XIP = (ptr_uint_t) mc->pc;
# ifdef X64
    sc->SC_FIELD(r8)  = mc->r8;
    sc->SC_FIELD(r9)  = mc->r9;
    sc->SC_FIELD(r10) = mc->r10;
    sc->SC_FIELD(r11) = mc->r11;
    sc->SC_FIELD(r12) = mc->r12;
    sc->SC_FIELD(r13) = mc->r13;
    sc->SC_FIELD(r14) = mc->r14;
    sc->SC_FIELD(r15) = mc->r15;
# endif /* X64 */
#elif defined(ARM)
    sc->SC_FIELD(arm_r0)  = mc->r0;
    sc->SC_FIELD(arm_r1)  = mc->r1;
    sc->SC_FIELD(arm_r2)  = mc->r2;
    sc->SC_FIELD(arm_r3)  = mc->r3;
    sc->SC_FIELD(arm_r4)  = mc->r4;
    sc->SC_FIELD(arm_r5)  = mc->r5;
    sc->SC_FIELD(arm_r6)  = mc->r6;
    sc->SC_FIELD(arm_r7)  = mc->r7;
    sc->SC_FIELD(arm_r8)  = mc->r8;
    sc->SC_FIELD(arm_r9)  = mc->r9;
    sc->SC_FIELD(arm_r10) = mc->r10;
    /* the sigcontext names r11-r15 by role: fp, ip, sp, lr, pc */
    sc->SC_FIELD(arm_fp)  = mc->r11;
    sc->SC_FIELD(arm_ip)  = mc->r12;
    sc->SC_FIELD(arm_sp)  = mc->r13;
    sc->SC_FIELD(arm_lr)  = mc->r14;
    sc->SC_FIELD(arm_pc)  = mc->r15;
# ifdef X64
#  error NYI on AArch64
# endif /* X64 */
#endif /* X86/ARM */
    mcontext_to_sigcontext_simd(sc, mc);
}

/* Translates the register state in sc in place, going through a
 * priv_mcontext_t and translate_mcontext() while holding
 * thread_initexit_lock.
 * Returns whether successful.  If avoid_failure, tries to translate
 * at least pc if not successful.  Pass f if known.
 */
static bool
translate_sigcontext(dcontext_t *dcontext,  sigcontext_t *sc, bool avoid_failure,
                     fragment_t *f)
{
    priv_mcontext_t mc;
    bool success = false;

    sigcontext_to_mcontext(&mc, sc);
    /* FIXME: if cannot find exact match, we're in trouble!
     * probably ok to delay, since that indicates not a synchronous
     * signal.
     */
    /* FIXME : in_fcache() (called by recreate_app_state) grabs fcache
     * fcache_unit_areas.lock, we could deadlock! Also on initexit_lock
     * == PR 205795/1317
     */
    /* For safe recreation we need to either be couldbelinking or hold the
     * initexit lock (to keep someone from flushing current fragment), the
     * initexit lock is easier
     */
    mutex_lock(&thread_initexit_lock);
    /* PR 214962: we assume we're going to relocate to this stored context,
     * so we restore memory now
     */
    if (translate_mcontext(dcontext->thread_record, &mc,
                           true/*restore memory*/, f)) {
        /* full translation worked: write it all back */
        mcontext_to_sigcontext(sc, &mc);
        success = true;
    } else if (avoid_failure) {
        ASSERT_NOT_REACHED(); /* is ok to break things, is UNIX :) */
        /* FIXME : what to do? reg state might be wrong at least get pc */
        if (safe_is_in_fcache(dcontext, (cache_pc)sc->SC_XIP, (app_pc)sc->SC_XSP)) {
            sc->SC_XIP = (ptr_uint_t)recreate_app_pc(dcontext, mc.pc, f);
            ASSERT(sc->SC_XIP != (ptr_uint_t)NULL);
        } else {
            /* FIXME : can't even get pc right, what do we do here? */
            sc->SC_XIP = 0;
        }
    }
    mutex_unlock(&thread_initexit_lock);
    LOG(THREAD, LOG_ASYNCH, 3,
        "\ttranslate_sigcontext: just set frame's eip to "PFX"\n", sc->SC_XIP);
    return success;
}

/* Takes an os-specific context (interpreted as a sigcontext_t) and
 * transfers the current thread to it: builds a synthetic rt signal frame
 * on this stack holding that context, points the stack pointer at it and
 * jumps to dynamorio_sigreturn.  Does not return.
 */
void
thread_set_self_context(void *cxt)
{
    dcontext_t *dcontext = get_thread_private_dcontext();
    /* Unlike Windows we can't say "only set this subset of the
     * full machine state", so we need to get the rest of the state,
     */
    sigframe_rt_t frame;
#if defined(LINUX) || defined(DEBUG)
    sigcontext_t *sc = (sigcontext_t *) cxt;
#endif
    app_pc xsp_for_sigreturn;
#ifdef VMX86_SERVER
    ASSERT_NOT_IMPLEMENTED(false); /* PR 405694: can't use regular sigreturn! */
#endif
    memset(&frame, 0, sizeof(frame));
#ifdef LINUX
# ifdef X86
    /* We need room for full xstate if nec (this is x86=944, x64=832 bytes).
     * A real signal frame would be var-sized but we don't want to dynamically
     * allocate, and only the kernel looks at this, so no risk of some
     * app seeing a weird frame size.
     */
    struct _xstate __attribute__ ((aligned (AVX_ALIGNMENT))) xstate;
    /* start from a known state so no stale stack bytes end up in the frame */
    memset(&xstate, 0, sizeof(xstate));
# endif /* X86 */
    frame.uc.uc_mcontext = *sc;
# ifdef X86
    /* This must come after the struct copy above: the copy replaces every
     * field of uc_mcontext, including fpstate, so setting the pointer first
     * would leave the frame using the caller's pointer rather than our
     * local buffer.
     */
    frame.uc.uc_mcontext.fpstate = &xstate.fpstate;
# endif /* X86 */
#endif
    save_fpstate(dcontext, &frame);
    /* The kernel calls do_sigaltstack on sys_rt_sigreturn primarily to ensure
     * the frame is ok, but the side effect is we can mess up our own altstack
     * settings if we're not careful.  Having invalid ss_size looks good for
     * kernel 2.6.23.9 at least so we leave frame.uc.uc_stack as all zeros.
     */
    /* make sure sigreturn's mask setting doesn't change anything */
    sigprocmask_syscall(SIG_SETMASK, NULL, (kernel_sigset_t *) &frame.uc.uc_sigmask,
                        sizeof(frame.uc.uc_sigmask));
    LOG(THREAD_GET, LOG_ASYNCH, 2, "thread_set_self_context: pc="PFX"\n", sc->SC_XIP);
    /* set up xsp to point at &frame + sizeof(char*) */
    xsp_for_sigreturn = ((app_pc)&frame) + sizeof(char*);
#ifdef X86
    asm("mov  %0, %%"ASM_XSP : : "m"(xsp_for_sigreturn));
# ifdef MACOS
    ASSERT_NOT_IMPLEMENTED(false && "need to pass 2 params to SYS_sigreturn");
    asm("jmp _dynamorio_sigreturn");
# else
    asm("jmp dynamorio_sigreturn");
# endif /* MACOS/LINUX */
#elif defined(ARM)
    asm("ldr  "ASM_XSP", %0" : : "m"(xsp_for_sigreturn));
    asm("b    dynamorio_sigreturn");
#endif /* X86/ARM */
    ASSERT_NOT_REACHED();
}

/* Takes a priv_mcontext_t, converts it to a sigcontext_t and transfers the
 * current thread to it via thread_set_self_context().  Does not return.
 */
void
thread_set_self_mcontext(priv_mcontext_t *mc)
{
    sigcontext_t sc;
    /* mcontext_to_sigcontext() only assigns the registers held in the
     * mcontext, while thread_set_self_context() copies the entire struct
     * into its frame: zero it first so the remaining fields are not
     * indeterminate stack contents.
     * NOTE(review): the fields not covered by the mcontext are left as
     * zero here -- confirm that is what the sigreturn path expects.
     */
    memset(&sc, 0, sizeof(sc));
    mcontext_to_sigcontext(&sc, mc);
    thread_set_self_context((void *)&sc);
    ASSERT_NOT_REACHED();
}

#ifdef LINUX
/* Returns whether the app's registered action for sig has a usable
 * restorer: either SA_RESTORER is set, or the non-NULL restorer pointer
 * points at code matching the known sigreturn sequence for the app's frame
 * type.  The result of the code match is cached in
 * info->restorer_valid[sig] (-1 means not yet computed).
 */
static bool
sig_has_restorer(thread_sig_info_t *info, int sig)
{
#ifdef VMX86_SERVER
    /* vmkernel ignores SA_RESTORER (PR 405694) */
    return false;
#endif
    if (info->app_sigaction[sig] == NULL)
        return false;
    if (TEST(SA_RESTORER, info->app_sigaction[sig]->flags))
        return true;
    if (info->app_sigaction[sig]->restorer == NULL)
        return false;
    /* we cache the result due to the safe_read cost */
    if (info->restorer_valid[sig] == -1) {
        /* With older kernels, don't seem to need flag: if sa_restorer !=
         * NULL kernel will use it.  But with newer kernels that's not
         * true, and sometimes libc does pass non-NULL.
         */
        /* Signal restorer code for Ubuntu 7.04:
         *   0xffffe420 <__kernel_sigreturn+0>:      pop    %eax
         *   0xffffe421 <__kernel_sigreturn+1>:      mov    $0x77,%eax
         *   0xffffe426 <__kernel_sigreturn+6>:      int    $0x80
         *
         *   0xffffe440 <__kernel_rt_sigreturn+0>:   mov    $0xad,%eax
         *   0xffffe445 <__kernel_rt_sigreturn+5>:   int    $0x80
         */
        static const byte SIGRET_NONRT[8] =
          {0x58, 0xb8, 0x77, 0x00, 0x00, 0x00, 0xcd, 0x80};
        /* The rt sequence is only 7 bytes long.  Let the initializer size
         * the array so that the comparison below covers exactly those 7
         * bytes and does not additionally require a zero byte to follow
         * the int $0x80.
         */
        static const byte SIGRET_RT[] =
          {0xb8, 0xad, 0x00, 0x00, 0x00, 0xcd, 0x80};
        bool is_rt = IS_RT_FOR_APP(info, sig);
        const byte *pattern = is_rt ? SIGRET_RT : SIGRET_NONRT;
        size_t pattern_len = is_rt ? sizeof(SIGRET_RT) : sizeof(SIGRET_NONRT);
        byte buf[MAX(sizeof(SIGRET_NONRT), sizeof(SIGRET_RT))]= {0};
        /* read and compare only as many bytes as the selected sequence has */
        if (safe_read(info->app_sigaction[sig]->restorer, pattern_len, buf) &&
            memcmp(buf, pattern, pattern_len) == 0) {
            LOG(THREAD_GET, LOG_ASYNCH, 2,
                "sig_has_restorer %d: "PFX" looks like restorer, using w/o flag\n",
                sig, info->app_sigaction[sig]->restorer);
            info->restorer_valid[sig] = 1;
        } else
            info->restorer_valid[sig] = 0;
    }
    return (info->restorer_valid[sig] == 1);
}
#endif

/* Returns the size of the frame for delivering to the app:
 * sizeof(sigframe_plain_t) when building for LINUX and the app's handler
 * for sig does not take an rt frame, sizeof(sigframe_rt_t) otherwise.
 * For x64 this does NOT include struct _fpstate.
 */
static uint
get_app_frame_size(thread_sig_info_t *info, int sig)
{
#ifdef LINUX
    if (!IS_RT_FOR_APP(info, sig))
        return sizeof(sigframe_plain_t);
#endif
    /* Only LINUX has a plain frame type here, so every other case uses the
     * rt frame; returning unconditionally ensures no path falls off the end
     * of this non-void function.
     */
    return sizeof(sigframe_rt_t);
}

/* Returns the sigcontext belonging to the rt frame.  On 32-bit MACOS it is
 * located through the frame's puc pointer; everywhere else through the
 * ucontext embedded in the frame.
 */
sigcontext_t *
get_sigcontext_from_rt_frame(sigframe_rt_t *frame)
{
    sigcontext_t *sc;
#if defined(MACOS) && !defined(X64)
    /* Padding makes it unsafe to access uc on frame from kernel */
    sc = SIGCXT_FROM_UCXT(frame->puc);
#else
    sc = SIGCXT_FROM_UCXT(&frame->uc);
#endif
    return sc;
}

/* Returns the sigcontext inside an app-format frame for sig.  The frame is
 * treated as an rt frame when IS_RT_FOR_APP says so; otherwise it is
 * treated as a plain frame on LINUX, and NULL is returned elsewhere.
 */
static sigcontext_t *
get_sigcontext_from_app_frame(thread_sig_info_t *info, int sig, void *frame)
{
    if (IS_RT_FOR_APP(info, sig))
        return get_sigcontext_from_rt_frame((sigframe_rt_t *)frame);
#ifdef LINUX
    return (sigcontext_t *) &(((sigframe_plain_t *)frame)->sc);
#else
    return NULL;
#endif
}

/* Returns the sigcontext of the rt frame stored in the pending slot for
 * sig.  The slot must be non-NULL.
 */
static sigcontext_t *
get_sigcontext_from_pending(thread_sig_info_t *info, int sig)
{
    sigframe_rt_t *pending_frame;
    ASSERT(info->sigpending[sig] != NULL);
    pending_frame = &info->sigpending[sig]->rt_frame;
    return get_sigcontext_from_rt_frame(pending_frame);
}

/* Returns the address on the appropriate signal stack where we should copy
 * the frame.
 * If frame is NULL, assumes signal happened while in DR and has been delayed,
 * and thus we need to provide fpstate regardless of whether the original
 * had it.  If frame is non-NULL, matches frame's amount of fpstate.
 *
 * The address is computed by starting from the interrupted stack pointer
 * (or the top of the app's alternate signal stack) and then moving down by,
 * in order: the app frame size, the extra xstate space (LINUX x86 only),
 * the redzone, the padding needed to reach 16-byte alignment, and finally
 * one reg_t slot.
 */
static byte *
get_sigstack_frame_ptr(dcontext_t *dcontext, int sig, sigframe_rt_t *frame)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    /* with no frame passed in, the context comes from the pending slot for sig */
    sigcontext_t *sc = (frame == NULL) ?
        get_sigcontext_from_pending(info, sig) :
        get_sigcontext_from_rt_frame(frame);
    byte *sp;

    if (frame != NULL) {
        /* signal happened while in cache, grab interrupted xsp */
        sp = (byte *) sc->SC_XSP;
        LOG(THREAD, LOG_ASYNCH, 3,
            "get_sigstack_frame_ptr: using frame's xsp "PFX"\n", sp);
    } else {
        /* signal happened while in DR, use stored xsp */
        sp = (byte *) get_mcontext(dcontext)->xsp;
        LOG(THREAD, LOG_ASYNCH, 3, "get_sigstack_frame_ptr: using app xsp "PFX"\n", sp);
    }

    if (APP_HAS_SIGSTACK(info)) {
        /* app has own signal stack */
        LOG(THREAD, LOG_ASYNCH, 3,
            "get_sigstack_frame_ptr: app has own stack "PFX"\n",
            info->app_sigstack.ss_sp);
        LOG(THREAD, LOG_ASYNCH, 3,
            "\tcur sp="PFX" vs app stack "PFX"-"PFX"\n",
            sp, info->app_sigstack.ss_sp,
            info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
        /* An sp exactly at the top of the alt stack fails this test, but the
         * else branch then assigns that very same top address, so the
         * outcome for that case is identical.
         */
        if (sp > (byte *)info->app_sigstack.ss_sp &&
            sp - (byte *)info->app_sigstack.ss_sp < info->app_sigstack.ss_size) {
            /* we're currently in the alt stack, so use current xsp */
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tinside alt stack, so using current xsp "PFX"\n", sp);
        } else {
            /* need to go to top, stack grows down */
            sp = info->app_sigstack.ss_sp + info->app_sigstack.ss_size;
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tnot inside alt stack, so using base xsp "PFX"\n", sp);
        }
    }
    /* now get frame pointer: need to go down to first field of frame */
    sp -= get_app_frame_size(info, sig);
#if defined(LINUX) && defined(X86)
    if (frame == NULL) {
        /* XXX i#641: we always include space for full xstate,
         * even if we don't use it all, which does not match what the
         * kernel does, but we're not tracking app actions to know whether
         * we can skip lazy fpstate on the delay
         */
        sp -= XSTATE_FRAME_EXTRA;
    } else {
        if (sc->fpstate != NULL) {
            /* The kernel doesn't seem to lazily include avx, so we don't either,
             * which simplifies all our frame copying: if YMM_ENABLED() and the
             * fpstate pointer is non-NULL, then we assume there's space for
             * full xstate
             */
            sp -= XSTATE_FRAME_EXTRA;
            DOCHECK(1, {
                if (YMM_ENABLED()) {
                    ASSERT_CURIOSITY(sc->fpstate->sw_reserved.magic1 == FP_XSTATE_MAGIC1);
                    ASSERT(sc->fpstate->sw_reserved.extended_size <= XSTATE_FRAME_EXTRA);
                }
            });
        }
        /* a frame that arrived without fpstate gets no extra space */
    }
#endif /* LINUX && X86 */
    /* PR 369907: don't forget the redzone */
    sp -= REDZONE_SIZE;

    /* Align to 16-bytes.  The kernel does this for both 32 and 64-bit code
     * these days, so we do as well.
     */
    sp = (byte *) ALIGN_BACKWARD(sp, 16);
    sp -= sizeof(reg_t);  /* Model retaddr. */

    LOG(THREAD, LOG_ASYNCH, 3, "\tplacing frame at "PFX"\n", sp);
    return sp;
}

#if defined(LINUX) && !defined(X64)
/* Fills in the plain (non-rt) frame f_new from the rt frame f_old:
 * pretcode, signal number, sigcontext, signal mask (split into oldmask and
 * extramask) and retcode.  On X86, if the source context has fpstate, it
 * is copied to the first XSTATE_ALIGNMENT-aligned address past the end of
 * f_new and f_new's fpstate pointer is set to that copy; the caller must
 * have provided enough room there.
 */
static void
convert_frame_to_nonrt(dcontext_t *dcontext, int sig, sigframe_rt_t *f_old,
                       sigframe_plain_t *f_new)
{
    /* one lookup of the source context serves both the copy and the fpstate test */
    sigcontext_t *src_sc = get_sigcontext_from_rt_frame(f_old);

    f_new->pretcode = f_old->pretcode;
    f_new->sig = f_old->sig;
    memcpy(&f_new->sc, src_sc, sizeof(sigcontext_t));
# ifdef X86
    if (src_sc->fpstate != NULL) {
        /* up to caller to include enough space for fpstate at end */
        byte *fp_dst = (byte *)
            ALIGN_FORWARD(((byte *)f_new) + sizeof(*f_new), XSTATE_ALIGNMENT);
        memcpy(fp_dst, src_sc->fpstate, XSTATE_DATA_SIZE);
        f_new->sc.fpstate = (struct _fpstate *) fp_dst;
    }
# endif /* X86 */
    /* first mask word lives in the sigcontext, the rest in extramask */
    f_new->sc.oldmask = f_old->uc.uc_sigmask.sig[0];
    memcpy(&f_new->extramask, &f_old->uc.uc_sigmask.sig[1],
           (_NSIG_WORDS-1) * sizeof(uint));
    memcpy(&f_new->retcode, &f_old->retcode, RETCODE_SIZE);
    LOG(THREAD, LOG_ASYNCH, 3, "\tconverted rt frame to non-rt frame\n");
    /* now fill in our extra field */
    f_new->sig_noclobber = f_new->sig;
}

/* Converts f_old to a plain frame in a local scratch buffer and then
 * copies only the first "size" bytes of the result to f_new.
 * Kept as its own routine so that the large scratch buffer does not cost
 * stack space on the common path.
 */
static void
convert_frame_to_nonrt_partial(dcontext_t *dcontext, int sig, sigframe_rt_t *f_old,
                               sigframe_plain_t *f_new, size_t size)
{
# ifdef X86
    /* room for the plain frame plus trailing extended state */
    char scratch[sizeof(sigframe_plain_t) + AVX_FRAME_EXTRA];
    convert_frame_to_nonrt(dcontext, sig, f_old, (sigframe_plain_t *) scratch);
    memcpy(f_new, (sigframe_plain_t *) scratch, size);
# elif defined(ARM)
    /* FIXME i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
# endif /* X86/ARM */
}
#endif

/* Exported for call from master_signal_handler asm routine.
 * For the rt signal frame f_old that was copied to f_new, updates
 * the intra-frame absolute pointers to point to the new addresses
 * in f_new.
 * Only updates the pretcode to the stored app restorer if for_app.
 *
 * dcontext may be NULL, in which case the current thread's dcontext is
 * looked up.  The pointers handled are: pretcode (LINUX), pinfo and puc
 * (non-X64), the fpstate pointer together with a copy of the fpstate
 * contents (LINUX X86), and the ucontext's mcontext pointer (MACOS X86).
 */
void
fixup_rtframe_pointers(dcontext_t *dcontext, int sig,
                       sigframe_rt_t *f_old, sigframe_rt_t *f_new, bool for_app)
{
    if (dcontext == NULL)
        dcontext = get_thread_private_dcontext();
    ASSERT(dcontext != NULL);
#ifdef LINUX
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    bool has_restorer = sig_has_restorer(info, sig);
#  ifdef DEBUG
    /* log level used by the LOG calls below */
    uint level = 3;
#    if !defined(HAVE_MEMINFO)
    /* avoid logging every single TRY probe fault */
    if (!dynamo_initialized)
        level = 5;
#    endif
#  endif

    /* the app's own restorer is only installed when delivering to the app */
    if (has_restorer && for_app)
        f_new->pretcode = (char *) info->app_sigaction[sig]->restorer;
    else {
#  ifdef VMX86_SERVER
        /* PR 404712: skip kernel's restorer code */
        if (for_app)
            f_new->pretcode = (char *) dynamorio_sigreturn;
#  else
#    ifdef X64
        ASSERT(!for_app);
#    else
        /* only point at retcode if old one was -- with newer OS, points at
         * vsyscall page and there is no restorer, yet stack restorer code left
         * there for gdb compatibility
         */
        if (f_old->pretcode == f_old->retcode)
            f_new->pretcode = f_new->retcode;
        /* else, pointing at vsyscall, or we set it to dynamorio_sigreturn in
         * master_signal_handler
         */
        LOG(THREAD, LOG_ASYNCH, level, "\tleaving pretcode with old value\n");
#    endif
#  endif
    }
#endif /* LINUX */
#ifndef X64
    /* re-aim the siginfo and ucontext pointers at f_new's own copies */
    f_new->pinfo = &(f_new->info);
    f_new->puc = &(f_new->uc);
#endif
#ifdef X86
# ifdef LINUX
    if (f_old->uc.uc_mcontext.fpstate != NULL) {
        /* the fpstate copy goes at the first XSTATE_ALIGNMENT-aligned
         * address past the end of the new frame
         */
        uint frame_size = get_app_frame_size(info, sig);
        byte *frame_end = ((byte *)f_new) + frame_size;
        byte *tgt = (byte *) ALIGN_FORWARD(frame_end, XSTATE_ALIGNMENT);
        ASSERT(tgt - frame_end <= XSTATE_FRAME_EXTRA);
        memcpy(tgt, f_old->uc.uc_mcontext.fpstate, sizeof(struct _fpstate));
        f_new->uc.uc_mcontext.fpstate = (struct _fpstate *) tgt;
        if (YMM_ENABLED()) {
            /* also carry over the xstate header and the ymmh area */
            struct _xstate *xstate_new = (struct _xstate *) tgt;
            struct _xstate *xstate_old = (struct _xstate *) f_old->uc.uc_mcontext.fpstate;
            memcpy(&xstate_new->xstate_hdr, &xstate_old->xstate_hdr,
                   sizeof(xstate_new->xstate_hdr));
            memcpy(&xstate_new->ymmh, &xstate_old->ymmh, sizeof(xstate_new->ymmh));
        }
        LOG(THREAD, LOG_ASYNCH, level+1, "\tfpstate old="PFX" new="PFX"\n",
            f_old->uc.uc_mcontext.fpstate, f_new->uc.uc_mcontext.fpstate);
    } else {
        /* if fpstate is not set up, we're delivering signal immediately,
         * and we shouldn't need an fpstate since DR code won't modify it;
         * only if we delayed will we need it, and when delaying we make
         * room and set up the pointer in copy_frame_to_pending.
         * xref i#641.
         */
        LOG(THREAD, LOG_ASYNCH, level+1, "\tno fpstate needed\n");
    }
    LOG(THREAD, LOG_ASYNCH, level, "\tretaddr = "PFX"\n", f_new->pretcode);
#  ifdef RETURN_AFTER_CALL
    info->signal_restorer_retaddr = (app_pc) f_new->pretcode;
#  endif
    /* 32-bit kernel copies to aligned buf first */
    IF_X64(ASSERT(ALIGNED(f_new->uc.uc_mcontext.fpstate, 16)));
# elif defined(MACOS)
    /* point the ucontext's mcontext pointer at the mc field inside f_new */
    f_new->puc->uc_mcontext = (IF_X64_ELSE(_STRUCT_MCONTEXT64, _STRUCT_MCONTEXT32) *)
        &f_new->mc;
    LOG(THREAD, LOG_ASYNCH, 3, "\tf_new="PFX", handler="PFX"\n", f_new, &f_new->handler);
    ASSERT(!for_app || ALIGNED(&f_new->handler, 16));
# endif /* LINUX */
#endif /* X86 */
}

/* Copies the rt frame to dst.  Everywhere except 32-bit MACOS this is one
 * plain copy of sizeof(sigframe_rt_t) bytes.  On 32-bit MACOS a frame that
 * comes straight from the kernel (!from_pending) is copied in two pieces
 * so that the kernel's mid-frame padding is dropped.
 */
static void
memcpy_rt_frame(sigframe_rt_t *frame, byte *dst, bool from_pending)
{
#if defined(MACOS) && !defined(X64)
    if (!from_pending) {
        /* The kernel puts padding in the middle.  We collapse that padding here
         * and re-align when we copy to the app stack.
         * We should not reference fields from mc onward in what the kernel put
         * on the stack, as our sigframe_rt_t layout does not match the kernel's
         * variable mid-struct padding.
         */
        sigframe_rt_t *out = (sigframe_rt_t *) dst;
        sigcontext_t *kernel_sc = SIGCXT_FROM_UCXT(frame->puc);
        /* everything up to and including the puc field */
        size_t head_sz = offsetof(sigframe_rt_t, puc) + sizeof(frame->puc);
        /* everything from the mc field to the end of the struct */
        size_t tail_sz = sizeof(sigframe_rt_t) - offsetof(sigframe_rt_t, mc);
        memcpy(dst, frame, head_sz);
        memcpy(&out->mc, kernel_sc, tail_sz);
        return;
    }
#endif
    memcpy(dst, frame, sizeof(sigframe_rt_t));
}

/* Copies frame to sp.
 * PR 304708: we now leave in rt form right up until we copy to the
 * app stack, so that we can deliver to a client at a safe spot
 * in rt form, so this routine now converts to a plain frame if necessary.
 * If no restorer, touches up pretcode
 * (and if rt_frame, touches up pinfo and puc)
 * Also touches up fpstate pointer
 *
 * Before copying, every page the frame will occupy is checked for
 * writability.  A page that is unwritable only because of our own
 * executable-area protection is flushed; a page that is genuinely
 * unwritable leads to a forged exception or to process termination.
 * from_pending is passed through to memcpy_rt_frame().
 */
static void
copy_frame_to_stack(dcontext_t *dcontext, int sig, sigframe_rt_t *frame, byte *sp,
                    bool from_pending)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    bool rtframe = IS_RT_FOR_APP(info, sig);
    uint frame_size = get_app_frame_size(info, sig);
#if defined(LINUX) && !defined(X64)
    bool has_restorer = sig_has_restorer(info, sig);
#endif
    byte *check_pc;
    /* total bytes to be written: the frame plus, on LINUX x86, the extra
     * xstate space when the source frame carries fpstate
     */
    uint size = frame_size;
#if defined(LINUX) && defined(X86)
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    size += (sc->fpstate == NULL ? 0 : XSTATE_FRAME_EXTRA);
#endif /* LINUX && X86 */

    LOG(THREAD, LOG_ASYNCH, 3, "copy_frame_to_stack: rt=%d, src="PFX", sp="PFX"\n",
        rtframe, frame, sp);

    /* before we write to the app's stack we need to see if it's writable */
    check_pc = (byte *) ALIGN_BACKWARD(sp, PAGE_SIZE);
    /* walk page by page over [sp, sp + size) */
    while (check_pc < (byte *)sp + size) {
        uint prot;
        DEBUG_DECLARE(bool ok = )
            get_memory_info(check_pc, NULL, NULL, &prot);
        ASSERT(ok);
        if (!TEST(MEMPROT_WRITE, prot)) {
            /* bytes from this page's start up to the end of the frame */
            size_t rest = (byte *)sp + size - check_pc;
            if (is_executable_area_writable(check_pc)) {
                LOG(THREAD, LOG_ASYNCH, 2,
                    "\tcopy_frame_to_stack: part of stack is unwritable-by-us @"PFX"\n",
                    check_pc);
                flush_fragments_and_remove_region(dcontext, check_pc, rest,
                                                  false /* don't own initexit_lock */,
                                                  false /* keep futures */);
            } else {
                LOG(THREAD, LOG_ASYNCH, 2,
                    "\tcopy_frame_to_stack: part of stack is unwritable @"PFX"\n",
                    check_pc);
                /* copy what we can */
                /* NOTE(review): "rest" is measured from check_pc to the end
                 * of the frame, yet the copy below starts at sp -- confirm
                 * this is the intended length for a partial copy.
                 */
                if (rtframe)
                    memcpy(sp, frame, rest);
#if defined(LINUX) && !defined(X64)
                else {
                    convert_frame_to_nonrt_partial(dcontext, sig, frame,
                                                   (sigframe_plain_t *) sp, rest);
                }
#endif
                /* now throw exception
                 * FIXME: what give as address?  what does kernel use?
                 * If the app intercepts SIGSEGV then we'll come right back
                 * here, so we terminate explicitly instead.  FIXME: set exit
                 * code properly: xref PR 205310.
                 */
                if (info->app_sigaction[SIGSEGV] == NULL)
                    os_forge_exception(0, UNREADABLE_MEMORY_EXECUTION_EXCEPTION);
                else
                    os_terminate(dcontext, TERMINATE_PROCESS);
                ASSERT_NOT_REACHED();
            }
        }
        check_pc += PAGE_SIZE;
    }
    /* all pages are writable: copy the frame in the form the app expects */
    if (rtframe) {
        ASSERT(frame_size == sizeof(*frame));
        memcpy_rt_frame(frame, sp, from_pending);
    }
#if defined(LINUX) && !defined(X64)
    else
        convert_frame_to_nonrt(dcontext, sig, frame, (sigframe_plain_t *) sp);
#endif

    /* if !has_restorer we do NOT add the restorer code to the exec list here,
     * to avoid removal problems (if handler never returns) and consistency problems
     * (would have to mark as selfmod right now if on stack).
     * for PROGRAM_SHEPHERDING we recognize as a pattern, and for consistency we
     * allow entire region once try to execute -- not a performance worry since should
     * very rarely be on the stack: should either be libc restorer code or with recent
     * OS in rx vsyscall page.
     */

    /* fix up pretcode, pinfo, puc, fpstate */
    if (rtframe) {
        fixup_rtframe_pointers(dcontext, sig, frame, (sigframe_rt_t *) sp,
                               true/*for app*/);
    }
#ifdef LINUX
    else {
#  ifdef X64
        ASSERT_NOT_REACHED();
#  else
        /* plain frame: pick the return address the handler will return to */
        sigframe_plain_t *f_new = (sigframe_plain_t *) sp;
#    ifndef VMX86_SERVER
        sigframe_plain_t *f_old = (sigframe_plain_t *) frame;
#    endif
        if (has_restorer)
            f_new->pretcode = (char *) info->app_sigaction[sig]->restorer;
        else {
#    ifdef VMX86_SERVER
            /* PR 404712: skip kernel's restorer code */
            f_new->pretcode = (char *) dynamorio_nonrt_sigreturn;
#    else
            /* see comments in rt case above */
            if (f_old->pretcode == f_old->retcode)
                f_new->pretcode = f_new->retcode;
            else {
                /* whether we set to dynamorio_sigreturn in master_signal_handler
                 * or it's still vsyscall page, we have to convert to non-rt
                 */
                f_new->pretcode = (char *) dynamorio_nonrt_sigreturn;
            } /* else, pointing at vsyscall most likely */
            LOG(THREAD, LOG_ASYNCH, 3, "\tleaving pretcode with old value\n");
#    endif
        }
        /* convert_frame_to_nonrt*() should have updated fpstate pointer.
         * The inlined fpstate is no longer used on new kernels, and we do that
         * as well on older kernels.
         */
        IF_X86(ASSERT(f_new->sc.fpstate != &f_new->fpstate));
        LOG(THREAD, LOG_ASYNCH, 3, "\tretaddr = "PFX"\n", f_new->pretcode);
#  ifdef RETURN_AFTER_CALL
        info->signal_restorer_retaddr = (app_pc) f_new->pretcode;
#  endif
        /* 32-bit kernel copies to aligned buf so no assert on fpstate alignment */
#  endif /* X64 */
    }
#endif /* LINUX */

#ifdef MACOS
    /* Update handler field, which is passed to the libc trampoline, to app */
    ASSERT(info->app_sigaction[sig] != NULL);
    ((sigframe_rt_t *)sp)->handler = (app_pc) info->app_sigaction[sig]->handler;
#endif
}

/* Copies frame to pending slot.
 * PR 304708: we now leave in rt form right up until we copy to the
 * app stack, so that we can deliver to a client at a safe spot
 * in rt form.
 *
 * The destination is the rt_frame of info->sigpending[sig], which this
 * routine dereferences without checking.  On LINUX x86 the
 * pending frame's fpstate pointer is always set to the slot's own xstate
 * buffer, whether or not the incoming frame carried fpstate.
 */
static void
copy_frame_to_pending(dcontext_t *dcontext, int sig, sigframe_rt_t *frame
                      _IF_CLIENT(byte *access_address))
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    sigframe_rt_t *dst = &(info->sigpending[sig]->rt_frame);
    memcpy_rt_frame(frame, (byte *)dst, false/*!already pending*/);

#if defined(LINUX) && defined(X86)
    /* For lazy fpstate, it's possible there was no fpstate when the kernel
     * sent us the frame, but in between then and now the app executed some
     * fp or xmm/ymm instrs.  Today we always add fpstate just in case.
     * XXX i#641 optimization: track whether any fp/xmm/ymm
     * instrs happened and avoid this.
     */
    /* we'll fill in updated fpstate at delivery time, but we go ahead and
     * copy now in case our own retrieval somehow misses some fields
     */
    if (frame->uc.uc_mcontext.fpstate != NULL) {
        memcpy(&info->sigpending[sig]->xstate, frame->uc.uc_mcontext.fpstate,
               /* XXX: assuming full xstate if avx is enabled */
               XSTATE_DATA_SIZE);
    }
    /* we must set the pointer now so that later save_fpstate, etc. work */
    dst->uc.uc_mcontext.fpstate = (struct _fpstate *) &info->sigpending[sig]->xstate;
#endif /* LINUX && X86 */

#ifdef CLIENT_INTERFACE
    info->sigpending[sig]->access_address = access_address;
#endif
    info->sigpending[sig]->use_sigcontext = false;

#ifdef MACOS
    /* We rely on puc to find sc so we have to fix it up */
    fixup_rtframe_pointers(dcontext, sig, frame, dst, false/*!for app*/);
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "copy_frame_to_pending\n");
    DOLOG(3, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "sigcontext:\n");
        dump_sigcontext(dcontext, get_sigcontext_from_rt_frame(dst));
    });
}

/**** real work ***********************************************/

/* Rewrites the sigreturn context sc so that, once the kernel resumes us, the
 * thread lands in the fcache return routine with last_exit as the exit stub
 * and next_pc recorded as dcontext->next_tag.
 */
static void
transfer_from_sig_handler_to_fcache_return(dcontext_t *dcontext, sigcontext_t *sc,
                                           app_pc next_pc, linkstub_t *last_exit)
{
    /* Where execution should continue once we are back in dispatch */
    dcontext->next_tag = next_pc;

    /* Stash the interrupted xax value before we overwrite it in sc below */
#ifdef X64
    /* x64 always uses shared gencode */
    get_local_state_extended()->spill_space.xax = sc->SC_XAX;
#else
    get_mcontext(dcontext)->IF_X86_ELSE(xax, r0) = sc->IF_X86_ELSE(SC_XAX, SC_R0);
#endif
    LOG(THREAD, LOG_ASYNCH, 2, "\tsaved xax "PFX"\n", sc->IF_X86_ELSE(SC_XAX, SC_R0));

    /* Point the sigreturn context at fcache_return: we go back through the
     * kernel, show up in fcache_return, and proceed through dispatch & interp
     * without messing up the dynamo stack.
     * Even if this is a write in the shared cache, we still go to the
     * private fcache_return for simplicity.
     */
    sc->SC_XIP = (ptr_uint_t) fcache_return_routine(dcontext);
    sc->IF_X86_ELSE(SC_XAX, SC_R0) = (ptr_uint_t) last_exit;
    LOG(THREAD, LOG_ASYNCH, 2,
        "\tset next_tag to "PFX", resuming in fcache_return\n", next_pc);
}

#ifdef CLIENT_INTERFACE
/* Invokes the client's signal event (if any hook is registered) for signal
 * sig and returns the action the client requested.  Without a registered
 * hook this returns DR_SIGNAL_DELIVER immediately.
 *
 * frame holds the context handed to the client as the mcontext; raw_sc, if
 * non-NULL, supplies the pre-translation context (i#182/PR 449996).
 * fragment, if non-NULL, is used for the fault fragment info instead of
 * doing a pc lookup.
 *
 * For DR_SIGNAL_DELIVER and DR_SIGNAL_REDIRECT the client's mcontext changes
 * are written back into frame's sigcontext; for DR_SIGNAL_SUPPRESS the raw
 * mcontext changes are written back into raw_sc when it is non-NULL.
 */
static dr_signal_action_t
send_signal_to_client(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                      sigcontext_t *raw_sc, byte *access_address,
                      bool blocked, fragment_t *fragment)
{
    sigcontext_t *frame_sc = get_sigcontext_from_rt_frame(frame);
    dr_siginfo_t siginfo;
    dr_signal_action_t verdict;
    fragment_t lookup_wrapper;

    if (!dr_signal_hook_exists())
        return DR_SIGNAL_DELIVER;
    LOG(THREAD, LOG_ASYNCH, 2, "sending signal to client\n");

    siginfo.sig = sig;
    siginfo.drcontext = (void *) dcontext;
    /* Heap allocation is safe here since we do not send signals that
     * interrupt DR.  Two priv_mcontext_t-sized structs are a little big for
     * stack allocation.
     */
    siginfo.mcontext =
        heap_alloc(dcontext, sizeof(*siginfo.mcontext) HEAPACCT(ACCT_OTHER));
    siginfo.raw_mcontext =
        heap_alloc(dcontext, sizeof(*siginfo.raw_mcontext) HEAPACCT(ACCT_OTHER));
    dr_mcontext_init(siginfo.mcontext);
    dr_mcontext_init(siginfo.raw_mcontext);

    /* i#207: fragment tag and fcache start pc on fault. */
    siginfo.fault_fragment_info.tag = NULL;
    siginfo.fault_fragment_info.cache_start_pc = NULL;

    /* i#182/PR 449996: we provide the pre-translation context */
    siginfo.raw_mcontext_valid = (raw_sc != NULL);
    if (raw_sc != NULL) {
        sigcontext_to_mcontext(dr_mcontext_as_priv_mcontext(siginfo.raw_mcontext),
                               raw_sc);
        /* i#207: fragment tag and fcache start pc on fault. */
        /* FIXME: we should avoid the fragment_pclookup since it is expensive
         * and since we already did the work of a lookup when translating
         */
        if (fragment == NULL) {
            fragment = fragment_pclookup(dcontext, siginfo.raw_mcontext->pc,
                                         &lookup_wrapper);
        }
        if (fragment != NULL && !hide_tag_from_client(fragment->tag)) {
            siginfo.fault_fragment_info.tag = fragment->tag;
            siginfo.fault_fragment_info.cache_start_pc = FCACHE_ENTRY_PC(fragment);
            siginfo.fault_fragment_info.is_trace =
                TEST(FRAG_IS_TRACE, fragment->flags);
            siginfo.fault_fragment_info.app_code_consistent =
                !TESTANY(FRAG_WAS_DELETED|FRAG_SELFMOD_SANDBOXED, fragment->flags);
        }
    }

    /* A client using instrumentation that deliberately faults (to shift a
     * rare event out of the fastpath) has no way to compute the access
     * address, so we supply it.  With raw_mcontext available the client
     * could compute it, but we pass it along as a convenience anyway.
     */
    siginfo.access_address = access_address;
    siginfo.blocked = blocked;
    sigcontext_to_mcontext(dr_mcontext_as_priv_mcontext(siginfo.mcontext), frame_sc);

    /* The client is not allowed to call dr_redirect_execution(), so the
     * heap-allocated contexts will not leak.
     */
    verdict = instrument_signal(dcontext, &siginfo);

    switch (verdict) {
    case DR_SIGNAL_DELIVER:
    case DR_SIGNAL_REDIRECT:
        /* propagate client changes */
        CLIENT_ASSERT(siginfo.mcontext->flags == DR_MC_ALL,
                      "signal mcontext flags cannot be changed");
        mcontext_to_sigcontext(frame_sc,
                               dr_mcontext_as_priv_mcontext(siginfo.mcontext));
        break;
    case DR_SIGNAL_SUPPRESS:
        if (raw_sc != NULL) {
            /* propagate client changes */
            CLIENT_ASSERT(siginfo.raw_mcontext->flags == DR_MC_ALL,
                          "signal mcontext flags cannot be changed");
            mcontext_to_sigcontext(raw_sc,
                                   dr_mcontext_as_priv_mcontext(siginfo.raw_mcontext));
        }
        break;
    default:
        /* nothing to propagate for other actions */
        break;
    }

    heap_free(dcontext, siginfo.mcontext,
              sizeof(*siginfo.mcontext) HEAPACCT(ACCT_OTHER));
    heap_free(dcontext, siginfo.raw_mcontext,
              sizeof(*siginfo.raw_mcontext) HEAPACCT(ACCT_OTHER));
    return verdict;
}

/* Acts on the action a client returned for a signal that interrupted the
 * code cache: redirects, suppresses/ignores, or executes the default action.
 * Returns false if caller should exit; returns true only when the signal is
 * to be delivered to the app handler.
 */
static bool
handle_client_action_from_cache(dcontext_t *dcontext, int sig, dr_signal_action_t action,
                                sigframe_rt_t *our_frame, sigcontext_t *sc_orig,
                                bool blocked)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    sigcontext_t *sc = get_sigcontext_from_rt_frame(our_frame);
    bool app_ignores;
    bool take_default;

    /* In order to pass to the client, we come all the way here even for
     * signals the app has no handler for.
     */
    if (action == DR_SIGNAL_REDIRECT) {
        /* send_signal_to_client copied mcontext into our
         * master_signal_handler frame, so we set up for fcache_return w/
         * the mcontext state and this as next_tag
         */
        sigcontext_to_mcontext(get_mcontext(dcontext), sc);
        transfer_from_sig_handler_to_fcache_return(dcontext, sc, (app_pc) sc->SC_XIP,
                                  (linkstub_t *) get_sigreturn_linkstub());
        if (is_building_trace(dcontext)) {
            LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
            trace_abort(dcontext);
        }
        return false;
    }

    app_ignores = !blocked && info->app_sigaction[sig] != NULL &&
        info->app_sigaction[sig]->handler == (handler_t)SIG_IGN;
    if (action == DR_SIGNAL_SUPPRESS || app_ignores) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: not delivering!\n",
            (action == DR_SIGNAL_SUPPRESS) ?
            "client suppressing signal" :
            "app signal handler is SIG_IGN");
        /* restore original (untranslated) sc */
        *get_sigcontext_from_rt_frame(our_frame) = *sc_orig;
        return false;
    }

    /* no BYPASS for blocked */
    take_default = !blocked &&
        (action == DR_SIGNAL_BYPASS ||
         info->app_sigaction[sig] == NULL ||
         info->app_sigaction[sig]->handler == (handler_t)SIG_DFL);
    if (take_default) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: executing default action\n",
            (action == DR_SIGNAL_BYPASS) ?
            "client forcing default" :
            "app signal handler is SIG_DFL");
        if (execute_default_from_cache(dcontext, sig, our_frame, sc_orig)) {
            /* if we haven't terminated, restore original (untranslated) sc
             * on request.
             */
            *get_sigcontext_from_rt_frame(our_frame) = *sc_orig;
            LOG(THREAD, LOG_ASYNCH, 2, "%s: restored xsp="PFX", xip="PFX"\n",
                __FUNCTION__, get_sigcontext_from_rt_frame(our_frame)->SC_XSP,
                get_sigcontext_from_rt_frame(our_frame)->SC_XIP);
        }
        return false;
    }

    CLIENT_ASSERT(action == DR_SIGNAL_DELIVER, "invalid signal event return value");
    return true;
}
#endif

/* Reports a fatal fault via report_dynamorio_problem(), including a dump of
 * the general-purpose registers and flags taken from sc, and then terminates
 * the process via os_terminate().  Not expected to return.
 *
 * dumpcore_flag is passed through to report_dynamorio_problem().
 * pc is reported as the faulting pc; sc->SC_FP is passed alongside it.
 * prefix, signame, and where fill in the textual parts of the message.
 */
static void
abort_on_fault(dcontext_t *dcontext, uint dumpcore_flag, app_pc pc, sigcontext_t *sc,
               const char *prefix, const char *signame, const char *where)
{
    /* The conversion specifiers below must stay in lockstep with the
     * per-architecture argument list passed to report_dynamorio_problem().
     */
    const char *fmt =
        "%s %s at PC "PFX"\n"
        "Received SIG%s at%s pc "PFX" in thread "TIDFMT"\n"
        "Base: "PFX"\n"
        "Registers:"
#ifdef X86
        "eax="PFX" ebx="PFX" ecx="PFX" edx="PFX"\n"
        "\tesi="PFX" edi="PFX" esp="PFX" ebp="PFX"\n"
# ifdef X64
        "\tr8 ="PFX" r9 ="PFX" r10="PFX" r11="PFX"\n"
        "\tr12="PFX" r13="PFX" r14="PFX" r15="PFX"\n"
# endif /* X64 */
#elif defined(ARM)
# ifndef X64
        "  r0 ="PFX" r1 ="PFX" r2 ="PFX" r3 ="PFX"\n"
        "\tr4 ="PFX" r5 ="PFX" r6 ="PFX" r7 ="PFX"\n"
        "\tr8 ="PFX" r9 ="PFX" r10="PFX" r11="PFX"\n"
        "\tr12="PFX" r13="PFX" r14="PFX" r15="PFX"\n"
# else
#  error NYI on AArch64
# endif
#endif /* X86/ARM */
        "\teflags="PFX;

    report_dynamorio_problem(dcontext, dumpcore_flag,
                             pc, (app_pc) sc->SC_FP,
                             fmt, prefix, CRASH_NAME, pc,
                             signame, where, pc, get_thread_id(),
                             get_dynamorio_dll_start(),
#ifdef X86
                             sc->SC_XAX, sc->SC_XBX, sc->SC_XCX, sc->SC_XDX,
                             sc->SC_XSI, sc->SC_XDI, sc->SC_XSP, sc->SC_XBP,
# ifdef X64
                             sc->SC_FIELD(r8), sc->SC_FIELD(r9),
                             sc->SC_FIELD(r10), sc->SC_FIELD(r11),
                             sc->SC_FIELD(r12), sc->SC_FIELD(r13),
                             sc->SC_FIELD(r14), sc->SC_FIELD(r15),
# endif /* X64 */
#elif defined(ARM)
# ifndef X64
                             sc->SC_FIELD(arm_r0),  sc->SC_FIELD(arm_r1),
                             sc->SC_FIELD(arm_r2),  sc->SC_FIELD(arm_r3),
                             sc->SC_FIELD(arm_r4),  sc->SC_FIELD(arm_r5),
                             sc->SC_FIELD(arm_r6),  sc->SC_FIELD(arm_r7),
                             sc->SC_FIELD(arm_r8),  sc->SC_FIELD(arm_r9),
                             sc->SC_FIELD(arm_r10), sc->SC_FIELD(arm_fp),
                             sc->SC_FIELD(arm_ip),  sc->SC_FIELD(arm_sp),
                             sc->SC_FIELD(arm_lr),  sc->SC_FIELD(arm_pc),
# else
#  error NYI on AArch64
# endif /* X64 */
#endif /* X86/ARM */
                             sc->SC_XFLAGS);
    os_terminate(dcontext, TERMINATE_PROCESS);
    ASSERT_NOT_REACHED();
}

/* Convenience wrapper around abort_on_fault() for faults attributed to DR
 * itself: uses the internal-exception dumpcore flag and the core exception
 * label as the message prefix.  Not expected to return.
 */
static void
abort_on_DR_fault(dcontext_t *dcontext, app_pc pc, sigcontext_t *sc,
                  const char *signame, const char *where)
{
    const uint core_dump_flag = DUMPCORE_INTERNAL_EXCEPTION;

    abort_on_fault(dcontext, core_dump_flag, pc, sc, exception_label_core,
                   signame, where);
    ASSERT_NOT_REACHED();
}

/* Prepares interrupted fragment f so that the thread leaves the code cache
 * soon: unlinks f's outgoing links (when f is fine-grained and currently
 * linked) and, if f contains a syscall, mangles the syscall code so that the
 * syscall is not executed.
 *
 * pc is the interruption pc and is passed to mangle_syscall_code().
 *
 * Returns whether unlinked or mangled syscall.
 * Restored in receive_pending_signal.
 */
static bool
unlink_fragment_for_signal(dcontext_t *dcontext, fragment_t *f,
                           byte *pc/*interruption pc*/)
{
    /* We only come here if we interrupted a fragment in the cache,
     * which means that this thread's DR state is safe, and so it
     * should be ok to acquire a lock.  xref PR 596069.
     *
     * There is a race where if two threads hit a signal in the same
     * shared fragment, the first could re-link after the second
     * un-links but before the second exits, and the second could then
     * execute the syscall, resulting in arbitrary delay prior to
     * signal delivery.  We don't want to allocate global memory,
     * but we could use a static array of counters (since should
     * be small # of interrupted shared fragments at any one time)
     * used as refcounts so we only unlink when all are done.
     * Not bothering to implement now: going to live w/ chance of
     * long signal delays.  xref PR 596069.
     */
    bool changed = false;
    /* may not be linked if trace_relink or something */
    if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
        /* XXX PR 213040: we don't support unlinking coarse, so we try
         * not to come here, but for indirect branch and other spots
         * where we don't yet support translation (since can't fault)
         * we end up w/ no bound on delivery...
         */
    } else if (TEST(FRAG_LINKED_OUTGOING, f->flags)) {
        LOG(THREAD, LOG_ASYNCH, 3,
            "\tunlinking outgoing for interrupted F%d\n", f->id);
        SHARED_FLAGS_RECURSIVE_LOCK(f->flags, acquire, change_linking_lock);
        unlink_fragment_outgoing(dcontext, f);
        SHARED_FLAGS_RECURSIVE_LOCK(f->flags, release, change_linking_lock);
        changed = true;
    } else {
        LOG(THREAD, LOG_ASYNCH, 3,
            "\toutgoing already unlinked for interrupted F%d\n", f->id);
    }
    if (TEST(FRAG_HAS_SYSCALL, f->flags)) {
        /* Syscalls are signal barriers!
         * Make sure the next syscall (if any) in f is not executed!
         * instead go back to dispatch right before the syscall
         */
        /* syscall mangling does a bunch of decodes but only one write,
         * changing the target of a short jmp, which is atomic
         * since a one-byte write, so we don't need the change_linking_lock.
         */
        /* The mangling must happen regardless of whether we unlinked above,
         * so do not fold this call into a short-circuiting "changed || ..."
         * expression: that would skip the call whenever changed is already
         * true and leave the syscall executable.
         */
        if (mangle_syscall_code(dcontext, f, pc, false/*do not skip exit cti*/))
            changed = true;
    }
    return changed;
}

/* Returns whether interruption pc within fragment f sits directly at, or
 * directly after, an inlined syscall instruction.  Always false for
 * fragments that are not marked FRAG_HAS_SYSCALL.
 */
static bool
interrupted_inlined_syscall(dcontext_t *dcontext, fragment_t *f,
                            byte *pc/*interruption pc*/)
{
    instr_t decoded;
    byte *after;
    bool at_syscall_boundary = false;

    if (!TEST(FRAG_HAS_SYSCALL, f->flags))
        return false;

    /* PR 596147: if the thread is currently in an inlined
     * syscall when a signal comes in, we can't delay and bound the
     * delivery time: we need to deliver now.  Should decode
     * backward and see if syscall.  We assume our translation of
     * the interruption state is fine to re-start: i.e., the syscall
     * is complete if kernel has pc at post-syscall point, and
     * kernel set EINTR in eax if necessary.
     */
    /* Interrupted fcache, so ok to alloc memory for decode */
    instr_init(dcontext, &decoded);

    /* First possibility: the interruption pc is the syscall instr itself */
    after = decode(dcontext, pc, &decoded);
    if (after != NULL && instr_valid(&decoded) && instr_is_syscall(&decoded)) {
        /* pre-syscall but post-jmp so can't skip syscall */
        at_syscall_boundary = true;
    } else {
        /* Second possibility: the syscall instr immediately precedes pc */
        instr_reset(dcontext, &decoded);
        ASSERT(INT_LENGTH == SYSCALL_LENGTH);
        ASSERT(SYSENTER_LENGTH == SYSCALL_LENGTH);
        after = decode(dcontext, pc - SYSCALL_LENGTH, &decoded);
        if (after != NULL && instr_valid(&decoded) && instr_is_syscall(&decoded)) {
            /* decoding backward so check for exit cti jmp prior
             * to syscall to ensure no mismatch
             */
            instr_reset(dcontext, &decoded);
            after = decode(dcontext, pc - SYSCALL_LENGTH - JMP_LONG_LENGTH,
                           &decoded);
            if (after != NULL && instr_valid(&decoded) &&
                instr_get_opcode(&decoded) == OP_jmp) {
                /* post-inlined-syscall */
                at_syscall_boundary = true;
            }
        }
    }

    instr_free(dcontext, &decoded);
    return at_syscall_boundary;
}

/* i#1145: auto-restart syscalls interrupted by signals */
/* Decides whether the syscall interrupted by signal sig should be
 * re-executed and, if so, rewrites sc to make that happen: the syscall
 * number register is restored and the pc is moved back onto the syscall
 * instruction.
 *
 * f is the interrupted fragment when the syscall was inlined in the code
 * cache, or NULL, in which case the syscall number is read from the
 * dcontext's mcontext.
 *
 * Returns true if sc was adjusted for a restart; returns false, leaving sc
 * untouched, if the return register does not hold -EINTR, the app's handler
 * lacks SA_RESTART, the restart_syscalls option is off, or the syscall is
 * reported as non-restartable.
 */
static bool
adjust_syscall_for_restart(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                           sigcontext_t *sc, fragment_t *f)
{
    byte *pc = (byte *) sc->SC_XIP;
    instr_t instr;

    if (sc->IF_X86_ELSE(SC_XAX, SC_R0) != -EINTR) {
        /* The syscall succeeded, so no reason to interrupt.
         * Some syscalls succeed on a signal coming in.
         * E.g., SYS_wait4 on SIGCHLD, or reading from a slow device.
         */
        return false;
    }
    /* Don't restart if the app's handler says not to */
    if (info->app_sigaction[sig] != NULL &&
        !TEST(SA_RESTART, info->app_sigaction[sig]->flags)) {
        return false;
    }
    /* XXX i#1145: some syscalls are never restarted when interrupted by a signal.
     * We check those that are simple to distinguish below, but not all are.  We have
     * this under an option so it can be disabled if necessary.
     */
    if (!DYNAMO_OPTION(restart_syscalls))
        return false;

    /* The kernel has already put -EINTR into eax, so we must
     * restore the syscall number.  We assume no other register or
     * memory values have been clobbered from their pre-syscall
     * values.
     */
    int sysnum = -1;
    if (f != NULL) {
        /* Inlined syscall.  I'd use find_syscall_num() but we'd need to call
         * decode_fragment() and tweak find_syscall_num() to handle the skip-syscall
         * jumps, or grab locks and call recreate_fragment_ilist() -- both are
         * heavyweight, so we do our own decode loop.
         * We assume we'll find a mov-imm b/c otherwise we wouldn't have inlined this.
         */
        LOG(THREAD, LOG_ASYNCH, 3, "%s: decoding to find syscall #\n", __FUNCTION__);
        instr_init(dcontext, &instr);
        /* Walk forward from the fragment entry; pc is reused as the decode
         * cursor from here on and no longer holds sc->SC_XIP.
         */
        pc = FCACHE_ENTRY_PC(f);
        do {
            DOLOG(3, LOG_ASYNCH, {
                disassemble_with_bytes(dcontext, pc, THREAD);
            });
            instr_reset(dcontext, &instr);
            pc = decode(dcontext, pc, &instr);
            if (instr_get_opcode(&instr) == IF_X86_ELSE(OP_mov_imm, OP_mov) &&
                opnd_is_reg(instr_get_dst(&instr, 0)) &&
                opnd_get_reg(instr_get_dst(&instr, 0)) ==
                IF_X86_ELSE(REG_EAX /* must be EAX not XAX! */, DR_REG_R7) &&
                opnd_is_immed_int(instr_get_src(&instr, 0))) {
                sysnum = (int) opnd_get_immed_int(instr_get_src(&instr, 0));
                /* don't break: find last one before syscall */
            }
        /* Stop on decode failure, at the first syscall instr, or at the end
         * of the fragment.
         */
        } while (pc != NULL && instr_valid(&instr) && !instr_is_syscall(&instr) &&
                 pc < FCACHE_ENTRY_PC(f) + f->size);
        instr_free(dcontext, &instr);
        ASSERT(DYNAMO_OPTION(ignore_syscalls));
        ASSERT(sysnum > -1);
   } else {
        /* do_syscall => eax should be in mcontext */
        sysnum = (int) MCXT_SYSNUM_REG(get_mcontext(dcontext));
    }
    LOG(THREAD, LOG_ASYNCH, 2, "%s: syscall # is %d\n", __FUNCTION__, sysnum);
    if (sysnum_is_not_restartable(sysnum)) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: syscall is non-restartable\n", __FUNCTION__);
        return false;
    }
    /* From here on we are committed to restarting: sc is modified */
    sc->SC_SYSNUM_REG = sysnum;

    /* Now adjust the pc to point at the syscall instruction instead of after it,
     * so when we resume we'll go back to the syscall.
     *
     * XXX: this is a transparency issue: the app might expect a pc after the
     * syscall.  We live with it for now.
     */
    /* NOTE(review): when f != NULL, pc here is whatever the decode loop above
     * left behind (possibly NULL on a decode failure) rather than the
     * original sc->SC_XIP.  Presumably it ends up just past the interrupted
     * syscall -- confirm for fragments containing more than one syscall.
     */
#ifdef X86
    ASSERT(INT_LENGTH == SYSCALL_LENGTH &&
           INT_LENGTH == SYSENTER_LENGTH);
    if (pc == vsyscall_sysenter_return_pc) {
        sc->SC_XIP = (ptr_uint_t) (vsyscall_syscall_end_pc - SYSENTER_LENGTH);
        /* To restart sysenter we must re-copy xsp into xbp, as xbp is
         * clobbered by the kernel.
         */
        sc->SC_XBP = sc->SC_XSP;
    } else if (is_after_syscall_address(dcontext, pc)) {
        /* We're at do_syscall: point at app syscall instr */
        sc->SC_XIP = (ptr_uint_t) (dcontext->asynch_target - INT_LENGTH);
        DODEBUG({
            instr_init(dcontext, &instr);
            ASSERT(decode(dcontext, (app_pc) sc->SC_XIP, &instr) != NULL &&
                   instr_is_syscall(&instr));
            instr_free(dcontext, &instr);
        });
    } else {
        /* Verify that a syscall instr directly precedes pc before backing
         * the resume pc up over it.
         */
        instr_init(dcontext, &instr);
        pc = decode(dcontext, pc - INT_LENGTH, &instr);
        if (instr_is_syscall(&instr))
            sc->SC_XIP -= INT_LENGTH;
        else
            ASSERT_NOT_REACHED();
        instr_free(dcontext, &instr);
    }
#elif defined(ARM)
    /* The length of the syscall instr depends on the current ISA mode */
    int svc_length = dr_get_isa_mode(dcontext) == DR_ISA_ARM_THUMB ?
        SVC_THUMB_LENGTH : SVC_ARM_LENGTH;
    instr_init(dcontext, &instr);
    pc = decode(dcontext, pc - svc_length, &instr);
    if (instr_is_syscall(&instr))
        sc->SC_XIP -= svc_length;
    else
        ASSERT_NOT_REACHED();
    instr_free(dcontext, &instr);
#endif /* X86/ARM */
    LOG(THREAD, LOG_ASYNCH, 2, "%s: sigreturn pc is now "PFX"\n", __FUNCTION__,
        sc->SC_XIP);
    return true;
}

static void
record_pending_signal(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt,
                      sigframe_rt_t *frame, bool forged
                      _IF_CLIENT(byte *access_address))
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    sigcontext_t sc_orig;
    byte *pc = (byte *) sc->SC_XIP;
    byte *xsp = (byte*) sc->SC_XSP;
    bool receive_now = false;
    bool blocked = false;
    bool handled = false;
    bool at_syscall = false;
    sigpending_t *pend;
    fragment_t *f = NULL;
    fragment_t wrapper;

    /* We no longer block SUSPEND_SIGNAL (i#184/PR 450670) or SIGSEGV (i#193/PR 287309).
     * But we can have re-entrancy issues in this routine if the app uses the same
     * SUSPEND_SIGNAL, or the nested SIGSEGV needs to be sent to the app.  The
     * latter shouldn't happen unless the app sends SIGSEGV via SYS_kill().
     */
    if (ostd->processing_signal > 0 ||
        /* If we interrupted receive_pending_signal() we can't prepend a new
         * pending or delete an old b/c we might mess up the state so we
         * just drop this one: should only happen for alarm signal
         */
        (info->accessing_sigpending &&
         /* we do want to report a crash in receive_pending_signal() */
         (can_always_delay[sig] ||
          is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, &frame->info)))) {
        LOG(THREAD, LOG_ASYNCH, 1, "nested signal %d\n", sig);
        ASSERT(ostd->processing_signal == 0 || sig == SUSPEND_SIGNAL || sig == SIGSEGV);
        ASSERT(can_always_delay[sig] ||
               is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, &frame->info));
        /* To avoid re-entrant execution of special_heap_alloc() and of
         * prepending to the pending list we just drop this signal.
         * FIXME i#194/PR 453996: do better.
         */
        STATS_INC(num_signals_dropped);
        SYSLOG_INTERNAL_WARNING_ONCE("dropping nested signal");
        return;
    }
    ostd->processing_signal++; /* no need for atomicity: thread-private */

    /* First, check whether blocked, before we restore for sigsuspend (i#1340). */
    if (kernel_sigismember(&info->app_sigblocked, sig))
        blocked = true;

    if (info->in_sigsuspend) {
        /* sigsuspend ends when a signal is received, so restore the
         * old blocked set
         */
        info->app_sigblocked = info->app_sigblocked_save;
        info->in_sigsuspend = false;
        /* update the set to restore to post-signal-delivery */
#ifdef MACOS
        ucxt->uc_sigmask = *(__darwin_sigset_t *) &info->app_sigblocked;
#else
        ucxt->uc_sigmask = info->app_sigblocked;
#endif
#ifdef DEBUG
        if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
            LOG(THREAD, LOG_ASYNCH, 3, "after sigsuspend, blocked signals are now:\n");
            dump_sigset(dcontext, &info->app_sigblocked);
        }
#endif
    }

    if (info->app_sigaction[sig] != NULL &&
        info->app_sigaction[sig]->handler == (handler_t)SIG_IGN
        /* If a client registered a handler, put this in the queue.
         * Races between registering, queueing, and delivering are fine.
         */
        IF_CLIENT_INTERFACE(&& !dr_signal_hook_exists())) {
        LOG(THREAD, LOG_ASYNCH, 3,
            "record_pending_signal (%d at pc "PFX"): action is SIG_IGN!\n",
            sig, pc);
        ostd->processing_signal--;
        return;
    } else if (blocked) {
        /* signal is blocked by app, so just record it, don't receive now */
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d at pc "PFX"): signal is currently blocked\n",
            sig, pc);
        IF_LINUX(handled = notify_signalfd(dcontext, info, sig, frame));
    } else if (safe_is_in_fcache(dcontext, pc, xsp)) {
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from cache pc "PFX"\n", sig, pc);
        if (forged || can_always_delay[sig]) {
            /* to make translation easier, want to delay if can until dispatch
             * unlink cur frag, wait for dispatch
             */
            /* check for coarse first to avoid cost of coarse pclookup */
            if (get_fcache_coarse_info(pc) != NULL) {
                /* PR 213040: we can't unlink coarse.  If we fail to translate
                 * we'll switch back to delaying, below.
                 */
                if (sig_is_alarm_signal(sig) &&
                    info->sigpending[sig] != NULL &&
                    info->sigpending[sig]->next != NULL &&
                    info->skip_alarm_xl8 > 0) {
                    /* Translating coarse fragments is very expensive so we
                     * avoid doing it when we're having trouble keeping up w/
                     * the alarm frequency (PR 213040), but we make sure we try
                     * every once in a while to avoid unbounded signal delay
                     */
                    info->skip_alarm_xl8--;
                    STATS_INC(num_signals_coarse_delayed);
                } else {
                    if (sig_is_alarm_signal(sig))
                        info->skip_alarm_xl8 = SKIP_ALARM_XL8_MAX;
                    receive_now = true;
                    LOG(THREAD, LOG_ASYNCH, 2,
                        "signal interrupted coarse fragment so delivering now\n");
                }
            } else {
                f = fragment_pclookup(dcontext, pc, &wrapper);
                ASSERT(f != NULL);
                ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags)); /* checked above */
                LOG(THREAD, LOG_ASYNCH, 2, "\tdelaying until exit F%d\n", f->id);
                if (interrupted_inlined_syscall(dcontext, f, pc)) {
                    /* PR 596147: if delayable signal arrives after syscall-skipping
                     * jmp, either at syscall or post-syscall, we deliver
                     * immediately, since we can't bound the delay
                     */
                    receive_now = true;
                    LOG(THREAD, LOG_ASYNCH, 2,
                        "signal interrupted pre/post syscall itself so delivering now\n");
                    at_syscall = true;
                } else {
                    /* could get another signal but should be in same fragment */
                    ASSERT(info->interrupted == NULL || info->interrupted == f);
                    if (unlink_fragment_for_signal(dcontext, f, pc)) {
                        info->interrupted = f;
                        info->interrupted_pc = pc;
                    } else {
                        /* either was unlinked for trace creation, or we got another
                         * signal before exiting cache to handle 1st
                         */
                        ASSERT(info->interrupted == NULL ||
                               info->interrupted == f);
                    }
                }
            }
        } else {
            /* the signal interrupted code cache => run handler now! */
            receive_now = true;
            LOG(THREAD, LOG_ASYNCH, 2, "\tnot certain can delay so handling now\n");
        }
    } else if (in_generated_routine(dcontext, pc) ||
               /* XXX: should also check fine stubs */
               safe_is_in_coarse_stubs(dcontext, pc, xsp)) {
        /* Assumption: dynamo errors have been caught already inside
         * the master_signal_handler, thus any error in a generated routine
         * is an asynch signal that can be delayed
         */
        /* FIXME: dispatch on routine:
         * if fcache_return, treat as dynamo
         * if fcache_enter, unlink next frag, treat as dynamo
         *   what if next frag has syscall in it?
         * if indirect_branch_lookup prior to getting target...?!?
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from gen routine or stub "PFX"\n", sig, pc);
        /* i#1206: the syscall was interrupted, so we can go back to dispatch
         * and don't need to receive it now (which complicates post-syscall handling)
         * w/o any extra delay.
         */
        at_syscall = is_after_syscall_address(dcontext, pc);
        /* This could come from another thread's SYS_kill (via our gen do_syscall) */
        DOLOG(1, LOG_ASYNCH, {
            if (!is_after_syscall_address(dcontext, pc) &&
                !forged && !can_always_delay[sig]) {
                LOG(THREAD, LOG_ASYNCH, 1,
                    "WARNING: signal %d in gen routine: may cause problems!\n", sig);
            }
        });
    } else if (pc == vsyscall_sysenter_return_pc) {
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from vsyscall "PFX"\n", sig, pc);
        /* i#1206: the syscall was interrupted, so we can go back to dispatch
         * and don't need to receive it now (which complicates post-syscall handling)
         */
        at_syscall = true;
    } else {
        /* the signal interrupted dynamo => do not run handler now! */
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from dynamo or lib at pc "PFX"\n", sig, pc);
        if (!forged &&
            !can_always_delay[sig] &&
            !is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, &frame->info)) {
            /* i#195/PR 453964: don't re-execute if will just re-fault.
             * Our checks for dstack, etc. in master_signal_handler should
             * have accounted for everything
             */
            ASSERT_NOT_REACHED();
            abort_on_DR_fault(dcontext, pc, sc,
                              (sig == SIGSEGV) ? "SEGV" : "other", "unknown");
        }
    }

    LOG(THREAD, LOG_ASYNCH, 3, "\taction is not SIG_IGN\n");
#ifdef LINUX
    LOG(THREAD, LOG_ASYNCH, 3, "\tretaddr = "PFX"\n",
        frame->pretcode); /* pretcode has same offs for plain */
#endif

    if (receive_now) {
        /* we need to translate sc before we know whether client wants to
         * suppress, so we need a backup copy
         */
        bool xl8_success;

        /* i#1145: update the context for an auto-restart syscall
         * before we make the sc_orig copy or translate.
         */
        if (at_syscall)
            adjust_syscall_for_restart(dcontext, info, sig, sc, f);

        sc_orig = *sc;
        ASSERT(!forged);
        /* cache the fragment since pclookup is expensive for coarse (i#658) */
        f = fragment_pclookup(dcontext, (cache_pc)sc->SC_XIP, &wrapper);
        xl8_success = translate_sigcontext(dcontext, sc, !can_always_delay[sig], f);

        if (can_always_delay[sig] && !xl8_success) {
            /* delay: we expect this for coarse fragments if alarm arrives
             * in middle of ind branch region or sthg (PR 213040)
             */
            LOG(THREAD, LOG_ASYNCH, 2,
                "signal is in un-translatable spot in coarse fragment: delaying\n");
            receive_now = false;
        }
    }

    if (receive_now) {

        /* N.B.: since we abandon the old context for synchronous signals,
         * we do not need to mark this fragment as FRAG_CANNOT_DELETE
         */
#ifdef DEBUG
        if (stats->loglevel >= 2 && (stats->logmask & LOG_ASYNCH) != 0 &&
            safe_is_in_fcache(dcontext, pc, xsp)) {
            ASSERT(f != NULL);
            LOG(THREAD, LOG_ASYNCH, 2,
                "Got signal at pc "PFX" in this fragment:\n", pc);
            disassemble_fragment(dcontext, f, false);
        }
#endif

        LOG(THREAD, LOG_ASYNCH, 2, "Going to receive signal now\n");
        /* If we end up executing the default action, we'll go native
         * since we translated the context.  If there's a handler,
         * we'll copy the context to the app stack and then adjust the
         * original on our stack so we take over.
         */
        execute_handler_from_cache(dcontext, sig, frame, &sc_orig, f
                                   _IF_CLIENT(access_address));

    } else if (!handled) {

#ifdef CLIENT_INTERFACE
        /* i#182/PR 449996: must let client act on blocked non-delayable signals to
         * handle instrumentation faults.  Make sure we're at a safe spot: i.e.,
         * only raise for in-cache faults.  Checking forged and no-delay
         * to avoid the in-cache check for delayable signals => safer.
         */
        if (blocked && !forged && !can_always_delay[sig] &&
            safe_is_in_fcache(dcontext, pc, xsp)) {
            dr_signal_action_t action;
            /* cache the fragment since pclookup is expensive for coarse (i#658) */
            f = fragment_pclookup(dcontext, (cache_pc)sc->SC_XIP, &wrapper);
            sc_orig = *sc;
            translate_sigcontext(dcontext, sc, true/*shouldn't fail*/, f);
            action = send_signal_to_client(dcontext, sig, frame, &sc_orig,
                                           access_address, true/*blocked*/, f);
            /* For blocked signal early event we disallow BYPASS (xref i#182/PR 449996) */
            CLIENT_ASSERT(action != DR_SIGNAL_BYPASS,
                          "cannot bypass a blocked signal event");
            if (!handle_client_action_from_cache(dcontext, sig, action, frame,
                                                 &sc_orig, true/*blocked*/)) {
                ostd->processing_signal--;
                return;
            }
            /* restore original (untranslated) sc */
            *get_sigcontext_from_rt_frame(frame) = sc_orig;
        }
#endif

        /* i#196/PR 453847: avoid infinite loop of signals if try to re-execute */
        if (blocked && !forged && !can_always_delay[sig] &&
            !is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, &frame->info)) {
            ASSERT(default_action[sig] == DEFAULT_TERMINATE ||
                   default_action[sig] == DEFAULT_TERMINATE_CORE);
            LOG(THREAD, LOG_ASYNCH, 1,
                "blocked fatal signal %d cannot be delayed: terminating\n", sig);
            sc_orig = *sc;
            translate_sigcontext(dcontext, sc, true/*shouldn't fail*/, NULL);
            /* the process should be terminated */
            execute_default_from_cache(dcontext, sig, frame, &sc_orig);
            ASSERT_NOT_REACHED();
        }

        /* Happened in DR, do not translate context.  Record for later processing
         * at a safe point with a clean app state.
         */
        if (!blocked || sig >= OFFS_RT ||
            (blocked && info->sigpending[sig] == NULL)) {
            /* only have 1 pending for blocked non-rt signals */

            /* special heap alloc always uses sizeof(sigpending_t) blocks */
            pend = special_heap_alloc(info->sigheap);
            ASSERT(sig > 0 && sig <= MAX_SIGNUM);

            /* to avoid accumulating signals if we're slow in presence of
             * a high-rate itimer we only keep 2 alarm signals (PR 596768)
             */
            if (sig_is_alarm_signal(sig)) {
                if (info->sigpending[sig] != NULL &&
                    info->sigpending[sig]->next != NULL) {
                    ASSERT(info->sigpending[sig]->next->next == NULL);
                    /* keep the oldest, replace newer w/ brand-new one, for
                     * more spread-out alarms
                     */
                     sigpending_t *temp = info->sigpending[sig];
                     info->sigpending[sig] = temp->next;
                     special_heap_free(info->sigheap, temp);
                     LOG(THREAD, LOG_ASYNCH, 2,
                         "3rd pending alarm %d => dropping 2nd\n", sig);
                     STATS_INC(num_signals_dropped);
                     SYSLOG_INTERNAL_WARNING_ONCE("dropping 3rd pending alarm signal");
                }
            }

            pend->next = info->sigpending[sig];
            info->sigpending[sig] = pend;
            pend->unblocked = !blocked;

            /* FIXME: note that for asynchronous signals we don't need to
             *  bother to record exact machine context, even entire frame,
             *  since don't want to pass dynamo pc context to app handler.
             *  only copy frame for synchronous signals?  those only
             *  happen while in cache?  but for asynch, we would have to
             *  construct our own frame...kind of a pain.
             */
            copy_frame_to_pending(dcontext, sig, frame _IF_CLIENT(access_address));

            /* i#1145: check whether we should auto-restart an interrupted syscall */
            if (at_syscall) {
                /* Adjust the pending frame to restart the syscall, if applicable */
                sigframe_rt_t *frame = &(info->sigpending[sig]->rt_frame);
                sigcontext_t *sc_pend = get_sigcontext_from_rt_frame(frame);
                if (adjust_syscall_for_restart(dcontext, info, sig, sc_pend, f)) {
                    /* We're going to re-start this syscall after we go
                     * back to dispatch, run the post-syscall handler (for -EINTR),
                     * and deliver the signal.  We've adjusted the sigcontext
                     * for re-start on the sigreturn, but we need to tell
                     * execute_handler_from_dispatch() to use our sigcontext
                     * and not the mcontext.
                     * A client will see a second set of pre + post handlers for
                     * the restart, which seems reasonable, given the signal in
                     * between.
                     */
                    info->sigpending[sig]->use_sigcontext = true;
                }
            }

       } else {
            /* For clients, we document that we do not pass to them
             * unless we're prepared to deliver to app.  We would have
             * to change our model to pass them non-final-translated
             * contexts for delayable signals in order to give them
             * signals as soon as they come in.  Xref i#182/PR 449996.
             */
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tnon-rt signal already in queue, ignoring this one!\n");
        }

        if (!blocked)
            dcontext->signals_pending = true;
    }
    ostd->processing_signal--;
}

/* Tells apart signals raised by a kill-style system call from signals raised
 * by a faulting instruction.  Returns true for the former.
 *
 * A signal sent from another process cannot be identified by the interruption
 * point, but one sent from this thread should have interrupted us at our own
 * post-syscall point.
 * FIXME PR 368277: for other threads in same process we should set a flag
 * and identify them as well.
 * FIXME: for faults like SIGILL we could examine the interrupted pc
 * to see whether it is capable of generating such a fault (see code
 * used in handle_nudge_signal()).
 */
static bool
is_sys_kill(dcontext_t *dcontext, byte *pc, byte *xsp, siginfo_t *info)
{
#ifndef VMX86_SERVER /* does not properly set si_code */
    /* i#133: si_code identifies user-sent signals.  Even the 2.2 Linux
     * kernel uses <=0 to mean user-sent (except SIGIO), so we assume we
     * can rely on it.
     */
    if (info->si_code <= 0)
        return true;
#endif
    /* Otherwise only an interruption at our do_syscall point qualifies */
    if (!is_at_do_syscall(dcontext, pc, xsp))
        return false;

    /* ...and only when the syscall being executed is one that sends a signal */
    switch (dcontext->sys_num) {
    case SYS_kill:
#ifdef LINUX
    case SYS_tkill:
    case SYS_tgkill:
    case SYS_rt_sigqueueinfo:
#elif defined (MACOS)
    case SYS___pthread_kill:
#endif
        return true;
    default:
        return false;
    }
}

/* Works out which memory address the faulting instruction at instr_cache_pc
 * was accessing.
 *
 * The instruction is decoded and its memory operand addresses are computed
 * from the register values in sc.  Three strategies are tried in order:
 *   1) for SEGV_ACCERR with a non-NULL si_addr, accept si_addr if it falls
 *      inside one of the instruction's memory operands;
 *   2) otherwise query page protections for each memory operand address and
 *      pick the first one that is unmapped, unreadable, or (for a write)
 *      unwritable;
 *   3) otherwise, if instr_cache_pc itself is unmapped or not executable,
 *      use instr_cache_pc.
 *
 * write must be non-NULL.  It is passed to instr_compute_address_ex_priv(),
 * which presumably sets it for each operand examined; it is not touched on
 * the early-return paths (undecodable or invalid instruction).
 *
 * Returns the computed target, or NULL if the instruction could not be read,
 * was invalid, or no target could be identified.
 *
 * Note: the log messages say SIGSEGV, but the caller visible in this file
 * also invokes this routine for SIGBUS.
 */
static byte *
compute_memory_target(dcontext_t *dcontext, cache_pc instr_cache_pc,
                      sigcontext_t *sc, siginfo_t *si, bool *write)
{
    byte *target = NULL;
    instr_t instr;
    priv_mcontext_t mc;
    uint memopidx, memoppos, memopsize;
    opnd_t memop;
    bool found_target = false;
    bool in_maps;
    bool use_allmem = false;
    uint prot;

    LOG(THREAD, LOG_ALL, 2,
        "computing memory target for "PFX" causing SIGSEGV, kernel claims it is "PFX"\n",
        instr_cache_pc, (byte*)si->si_addr);

    /* We used to do a memory query to check if instr_cache_pc is readable, but
     * now we use TRY/EXCEPT because we don't have the instr length and the OS
     * query is expensive.  If decoding faults, the signal handler will longjmp
     * out before it calls us recursively.
     */
    /* NOTE(review): the except path returns without instr_free(); presumably
     * nothing needs freeing when decode() did not complete -- confirm.
     */
    instr_init(dcontext, &instr);
    TRY_EXCEPT(dcontext, {
        decode(dcontext, instr_cache_pc, &instr);
    }, {
        return NULL;  /* instr_cache_pc was unreadable */
    });

    if (!instr_valid(&instr)) {
        LOG(THREAD, LOG_ALL, 2,
            "WARNING: got SIGSEGV for invalid instr at cache pc "PFX"\n", instr_cache_pc);
        ASSERT_NOT_REACHED();
        instr_free(dcontext, &instr);
        return NULL;
    }

    /* Operand addresses below are computed from the interrupted register state */
    sigcontext_to_mcontext(&mc, sc);
    ASSERT(write != NULL);

    /* i#1009: If si_addr is plausibly one of the memory operands of the
     * faulting instruction, assume the target was si_addr.  If none of the
     * memops match, fall back to checking page protections, which can be racy.
     * For si_addr == NULL, we fall back to the protection check because it's
     * too likely to be a valid memop and we can live with a race on a page that
     * is typically unmapped.
     */
    if (si->si_code == SEGV_ACCERR && si->si_addr != NULL) {
        for (memopidx = 0;
             instr_compute_address_ex_priv(&instr, &mc, memopidx,
                                           &target, write, &memoppos);
             memopidx++) {
            /* i#1045: check whether operand and si_addr overlap */
            memop = *write ? instr_get_dst(&instr, memoppos) :
                instr_get_src(&instr, memoppos);
            memopsize = opnd_size_in_bytes(opnd_get_size(memop));
            LOG(THREAD, LOG_ALL, 2,
                "memory operand %u has address "PFX" and size %u\n",
                memopidx, target, memopsize);
            /* si_addr lies within [target, target + memopsize): report si_addr
             * itself rather than the operand's base address
             */
            if ((byte*)si->si_addr >= target &&
                (byte*)si->si_addr < target + memopsize) {
                target = (byte*)si->si_addr;
                found_target = true;
                break;
            }
        }
    }

    /* For fcache faults, use all_memory_areas, which is faster but acquires
     * locks.  If it's possible we're in DR, go to the OS to avoid deadlock.
     */
    if (DYNAMO_OPTION(use_all_memory_areas)) {
        use_allmem = safe_is_in_fcache(dcontext, instr_cache_pc,
                                       (byte *)sc->SC_XSP);
    }
    if (!found_target) {
        if (si->si_addr != NULL) {
            LOG(THREAD, LOG_ALL, 3,
                "%s: falling back to racy protection checks\n", __FUNCTION__);
        }
        /* i#115/PR 394984: consider all memops */
        for (memopidx = 0;
             instr_compute_address_ex_priv(&instr, &mc, memopidx,
                                           &target, write, NULL);
             memopidx++) {
            if (use_allmem) {
                in_maps = get_memory_info(target, NULL, NULL, &prot);
            } else {
                in_maps = get_memory_info_from_os(target, NULL, NULL, &prot);
            }
            /* This operand is the culprit if its address is unmapped or
             * unreadable, or if the access is a write to unwritable memory.
             * prot is only consulted when in_maps is true.
             */
            if ((!in_maps || !TEST(MEMPROT_READ, prot)) ||
                (*write && !TEST(MEMPROT_WRITE, prot))) {
                found_target = true;
                break;
            }
        }
    }

    if (!found_target) {
        /* probably an NX fault: how tell whether kernel enforcing? */
        in_maps = get_memory_info_from_os(instr_cache_pc, NULL, NULL, &prot);
        if (!in_maps || !TEST(MEMPROT_EXEC, prot)) {
            target = instr_cache_pc;
            found_target = true;
        }
    }

    /* we may still not find target, e.g. for SYS_kill(SIGSEGV) */
    /* Discard any address left in target by the operand loops above */
    if (!found_target)
        target = NULL;
    DOLOG(2, LOG_ALL, {
        LOG(THREAD, LOG_ALL, 2,
            "For SIGSEGV at cache pc "PFX", computed target %s "PFX"\n",
            instr_cache_pc, *write ? "write" : "read", target);
        loginst(dcontext, 2, &instr, "\tfaulting instr");
    });
    instr_free(dcontext, &instr);
    return target;
}

/* Handles the seg fault we expect when something writes to an executable
 * region that was writable and that we marked read-only.
 *
 * Returns false if target is not in such a region (per
 * was_executable_area_writable()).  Otherwise passes the write to
 * handle_modified_code() and returns true, which tells the caller to simply
 * have master_signal_handler return: either the write gets re-executed, or,
 * if handle_modified_code() supplied a pc to continue at, the sigcontext has
 * been redirected to fcache_return so that we go back to dispatch.
 *
 * If native_state is true, assumes the fault is not in the cache and thus
 * does not need translation but rather should always be re-executed.
 *
 * We have to figure out the target address ourselves: unfortunately the OS
 * doesn't tell us, nor whether it's a write.
 * FIXME: if sent from SYS_kill(SIGSEGV), the pc will be post-syscall,
 * and if that post-syscall instr is a write that could have faulted,
 * how can we tell the difference?
 */
static bool
check_for_modified_code(dcontext_t *dcontext, cache_pc instr_cache_pc,
                        sigcontext_t *sc, byte *target, bool native_state)
{
    app_pc resume_pc;
    app_pc app_fault_pc = NULL;
    fragment_t *frag = NULL;
    fragment_t frag_wrapper;

    if (!was_executable_area_writable(target))
        return false;

    ASSERT((cache_pc)sc->SC_XIP == instr_cache_pc);
    if (!native_state) {
        /* Translate instr_cache_pc to the original app pc.
         * DO NOT use translate_sigcontext: we don't want to change the
         * signal frame or else we'll lose control when we try to
         * return to the signal pc!
         *
         * For safe recreation we need to either be couldbelinking or hold
         * the initexit lock (to keep someone from flushing the current
         * fragment); the initexit lock is easier.
         */
        mutex_lock(&thread_initexit_lock);
        /* cache the fragment since pclookup is expensive for coarse units (i#658) */
        frag = fragment_pclookup(dcontext, instr_cache_pc, &frag_wrapper);
        app_fault_pc = recreate_app_pc(dcontext, instr_cache_pc, frag);
        ASSERT(app_fault_pc != NULL);
        mutex_unlock(&thread_initexit_lock);
    }

    resume_pc = handle_modified_code(dcontext, instr_cache_pc, app_fault_pc,
                                     target, frag);

    /* We are going to exit from the middle of a fragment (at the write), which
     * would mess up trace building.
     */
    if (!native_state && is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    if (resume_pc != NULL) {
        ASSERT(!native_state);
        /* Do not resume execution in cache, go back to dispatch. */
        transfer_from_sig_handler_to_fcache_return(dcontext, sc, resume_pc,
                                  (linkstub_t *) get_selfmod_linkstub());
    }
    /* else: re-execute the write.  Either way, just have
     * master_signal_handler return.
     */
    return true;
}

#ifndef HAVE_SIGALTSTACK
/* Values handed back by sig_should_swap_stack() for use by
 * clone_and_swap_stack when the handler must move to dstack.
 * The exact layout of this struct is relied on in master_signal_handler()
 * in x86.asm.
 */
struct clone_and_swap_args {
    /* Destination: derived from dcontext->dstack, lowered to leave room for
     * fpstate and then aligned backward.
     */
    byte *stack;
    /* Source top of stack: the stack pointer (sc->SC_XSP) at the signal
     * interruption point.
     */
    byte *tos;
};

/* Helper for swapping the handler over to dstack.
 * Returns false if this thread has no dcontext or is already running on
 * dstack.  Otherwise fills in args for clone_and_swap_stack and returns true.
 */
bool
sig_should_swap_stack(struct clone_and_swap_args *args, kernel_ucontext_t *ucxt)
{
    dcontext_t *dcontext = get_thread_private_dcontext();
    byte *cur_xsp;
    byte *dest_stack;
    sigcontext_t *sc;

    if (dcontext == NULL)
        return false;
    GET_STACK_PTR(cur_xsp);
    if (is_on_dstack(dcontext, cur_xsp))
        return false;

    /* Pass back the proper args to clone_and_swap_stack: we want to
     * copy to dstack from the tos at the signal interruption point.
     */
    sc = SIGCXT_FROM_UCXT(ucxt);
    dest_stack = dcontext->dstack;
    /* leave room for fpstate */
    dest_stack -= XSTATE_FRAME_EXTRA;
    args->stack = (byte *) ALIGN_BACKWARD(dest_stack, XSTATE_ALIGNMENT);
    args->tos = (byte *) sc->SC_XSP;
    return true;
}
#endif

/* Takes over the current thread after it was signaled via SUSPEND_SIGNAL.
 * This lives in its own routine mostly so that the priv_mcontext_t it needs
 * is not allocated in master_signal_handler_C's frame.
 * Not expected to return.
 */
static void
sig_take_over(sigcontext_t *sc)
{
    priv_mcontext_t takeover_mc;
    /* Convert the interrupted state into the form os_thread_take_over() takes */
    sigcontext_to_mcontext(&takeover_mc, sc);
    os_thread_take_over(&takeover_mc);
    /* Control should never come back here */
    ASSERT_NOT_REACHED();
}

/* Returns whether the interrupted pc recorded in ucxt's sigcontext is
 * recognized by is_safe_read_pc().
 */
static bool
is_safe_read_ucxt(kernel_ucontext_t *ucxt)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    return is_safe_read_pc((app_pc) sc->SC_XIP);
}

/* the master signal handler
 * WARNING: behavior varies with different versions of the kernel!
 * sigaction support was only added with 2.2
 */
#ifdef X64
/* stub in x86.asm passes our xsp to us */
# ifdef MACOS
void
master_signal_handler_C(handler_t handler, int style, int sig, siginfo_t *info,
                        kernel_ucontext_t *ucxt, byte *xsp)
# else
void
master_signal_handler_C(int sig, siginfo_t *siginfo, kernel_ucontext_t *ucxt,
                        byte *xsp)
# endif
#else
/* On ia32, adding a parameter disturbs the frame we're trying to capture, so we
 * add an intermediate frame and read the normal params off the stack directly.
 */
void
master_signal_handler_C(byte *xsp)
#endif
{
    sigframe_rt_t *frame = (sigframe_rt_t *) xsp;
#ifndef X64
    /* Read the normal arguments from the frame. */
    int sig = frame->sig;
    siginfo_t *siginfo = frame->pinfo;
    kernel_ucontext_t *ucxt = frame->puc;
#endif /* !X64 */
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
#ifdef DEBUG
    uint level = 2;
# if !defined(HAVE_MEMINFO)
    /* avoid logging every single TRY probe fault */
    if (!dynamo_initialized)
        level = 5;
# endif
#endif
    bool local;
#if defined(MACOS) && !defined(X64)
    /* The kernel clears fs, so we have to re-instate our selector, if
     * it was set in the first place.
     */
    if (sc->__ss.__fs != 0)
        tls_reinstate_selector(sc->__ss.__fs);
#endif
    dcontext_t *dcontext = get_thread_private_dcontext();

#ifdef MACOS
# ifdef X64
    ASSERT((YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX64)) ||
           (!YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT64)));
# else
    ASSERT((YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX32)) ||
           (!YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT)));
# endif
#endif

    /* i#350: To support safe_read or TRY_EXCEPT without a dcontext, use the
     * global dcontext
     * when handling safe_read faults.  This lets us pass the check for a
     * dcontext below and causes us to use the global log.
     */
    if (dcontext == NULL && (sig == SIGSEGV || sig == SIGBUS) &&
        (is_safe_read_ucxt(ucxt) ||
         (!dynamo_initialized && global_try_except.try_except_state != NULL))) {
        dcontext = GLOBAL_DCONTEXT;
    }

    if (dynamo_exited && get_num_threads() > 1 && sig == SIGSEGV) {
        /* PR 470957: this is almost certainly a race so just squelch it.
         * We live w/ the risk that it was holding a lock our release-build
         * exit code needs.
         */
        exit_thread_syscall(1);
    }
    /* FIXME: ensure the path for recording a pending signal does not grab any DR locks
     * that could have been interrupted
     * e.g., synchronize_dynamic_options grabs the stats_lock!
     */
    if (dcontext == NULL ||
        (dcontext != GLOBAL_DCONTEXT &&
         (dcontext->signal_field == NULL ||
          !((thread_sig_info_t*)dcontext->signal_field)->fully_initialized))) {
        /* FIXME: || !intercept_asynch, or maybe !under_our_control */
        /* FIXME i#26: this could be a signal arbitrarily sent to this thread.
         * We could try to route it to another thread, using a global queue
         * of pending signals.  But what if it was targeted to this thread
         * via SYS_{tgkill,tkill}?  Can we tell the difference, even if
         * we watch the kill syscalls: could come from another process?
         */
        if (sig_is_alarm_signal(sig)) {
            /* assuming an alarm during thread exit or init (xref PR 596127,
             * i#359): suppressing is fine
             */
        } else if (sig == SUSPEND_SIGNAL && dcontext == NULL) {
            /* We sent SUSPEND_SIGNAL to a thread we don't control (no
             * dcontext), which means we want to take over.
             */
            sig_take_over(sc);  /* no return */
            ASSERT_NOT_REACHED();
        } else {
            /* Using global dcontext because dcontext is NULL here. */
            DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); });
            SYSLOG_INTERNAL_ERROR("ERROR: master_signal_handler with no siginfo "
                                  "(i#26?): tid=%d, sig=%d", get_sys_thread_id(), sig);
        }
        /* see FIXME comments above.
         * workaround for now: suppressing is better than dying.
         */
        if (can_always_delay[sig])
            return;
        else
            exit_process_syscall(1);
    }

    /* we may be entering dynamo from code cache! */
    /* Note that this is unsafe if -single_thread_in_DR => we grab a lock =>
     * hang if signal interrupts DR: but we don't really support that option
     */
    ENTERING_DR();
    if (dcontext == GLOBAL_DCONTEXT) {
        local = false;
    } else {
        local = local_heap_protected(dcontext);
        if (local)
            SELF_PROTECT_LOCAL(dcontext, WRITABLE);
    }

    LOG(THREAD, LOG_ASYNCH, level, "\nmaster_signal_handler: sig=%d, retaddr="PFX"\n",
        sig, *((byte **)xsp));
    LOG(THREAD, LOG_ASYNCH, level+1,
        "siginfo: pid = %d, status = %d, errno = %d, si_code = %d\n",
        siginfo->si_pid, siginfo->si_status, siginfo->si_errno,
        siginfo->si_code);
    DOLOG(level+1, LOG_ASYNCH, { dump_sigcontext(dcontext, sc); });

#if !defined(X64) && !defined(VMX86_SERVER) && defined(LINUX)
    /* FIXME case 6700: 2.6.9 (FC3) kernel sets up our frame with a pretcode
     * of 0x440.  This happens if our restorer is unspecified (though 2.6.9
     * src code shows setting the restorer to a default value in that case...)
     * or if we explicitly point at dynamorio_sigreturn.  I couldn't figure
     * out why it kept putting 0x440 there.  So we fix the issue w/ this
     * hardcoded return.
     * This hack causes vmkernel to kill the process on sigreturn due to
     * vmkernel's non-standard sigreturn semantics.  PR 404712.
     */
    *((byte **)xsp) = (byte *) dynamorio_sigreturn;
#endif

    /* N.B.:
     * ucontext_t is defined in two different places.  The one we get
     * included is /usr/include/sys/ucontext.h, which would have us
     * doing this:
     *     void *pc = (void *) ucxt->uc_mcontext.gregs[EIP];
     * However, EIP is not defined for us (used to be in older
     * RedHat version) unless we define __USE_GNU, which we don't want to do
     * for other reasons, so we'd have to also say:
     *     #define EIP 14
     * Instead we go by the ucontext_t definition in
     * /usr/include/asm/ucontext.h, which has it containing a sigcontext struct,
     * defined in /usr/include/asm/sigcontext.h.  This is the definition used
     * by the kernel.  The two definitions are field-for-field
     * identical except that the sys one has an fpstate struct at the end --
     * but the next field in the frame is an fpstate.  The only mystery
     * is why the rt frame is declared as ucontext instead of sigcontext.
     * The kernel's version of ucontext must be the asm one!
     * And the sys one grabs the next field of the frame.
     * Also note that mcontext_t.fpregs == sigcontext.fpstate is NULL if
     * floating point operations have not been used (lazy fp state saving).
     * Also, sigset_t has different sizes according to kernel (8 bytes) vs.
     * glibc (128 bytes?).
     */

    switch (sig) {

    case SIGBUS: /* PR 313665: look for DR crashes on unaligned memory or mmap bounds */
    case SIGSEGV: {
        /* Older kernels do NOT fill out the signal-specific fields of siginfo,
         * except for SIGCHLD.  Thus we cannot do this:
         *     void *pc = (void*) siginfo->si_addr;
         * Thus we must use the third argument, which is a ucontext_t (see above)
         */
        sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
        void *pc = (void *) sc->SC_XIP;
        bool syscall_signal = false; /* signal came from syscall? */
        bool is_write = false;
        byte *target;
        bool is_DR_exception = false;

#ifdef SIDELINE
        if (dcontext == NULL) {
            SYSLOG_INTERNAL_ERROR("seg fault in sideline thread -- NULL dcontext!");
            ASSERT_NOT_REACHED();
        }
#endif
        if (is_safe_read_ucxt(ucxt) ||
            (!dynamo_initialized && global_try_except.try_except_state != NULL) ||
            dcontext->try_except.try_except_state != NULL) {
            /* handle our own TRY/EXCEPT */
            try_except_context_t *try_cxt;
#ifdef HAVE_MEMINFO
            /* our probe produces many of these every run */
            /* since we use for safe_*, making a _ONCE */
            SYSLOG_INTERNAL_WARNING_ONCE("(1+x) Handling our fault in a TRY at "PFX, pc);
#endif
            LOG(THREAD, LOG_ALL, level, "TRY fault at "PFX"\n", pc);
            if (TEST(DUMPCORE_TRY_EXCEPT, DYNAMO_OPTION(dumpcore_mask)))
                os_dump_core("try/except fault");

            if (is_safe_read_ucxt(ucxt)) {
                sc->SC_XIP = (reg_t) safe_read_resume_pc();
                /* Break out to log the normal return from the signal handler.
                 */
                break;
            }
            try_cxt = (dcontext != NULL) ? dcontext->try_except.try_except_state :
                global_try_except.try_except_state;
            ASSERT(try_cxt != NULL);

            /* The exception interception code did an ENTER so we must EXIT here */
            EXITING_DR();
            /* Since we have no sigreturn we have to restore the mask
             * manually, just like siglongjmp().  i#226/PR 492568: we rely
             * on the kernel storing the prior mask in ucxt, so we do not
             * need to store it on every setjmp.
             */
            /* Verify that there's no scenario where the mask gets changed prior
             * to a fault inside a try
             */
            ASSERT(memcmp(&try_cxt->context.sigmask,
                          &ucxt->uc_sigmask, sizeof(ucxt->uc_sigmask)) == 0);
            sigprocmask_syscall(SIG_SETMASK, SIGMASK_FROM_UCXT(ucxt), NULL,
                                sizeof(ucxt->uc_sigmask));
            DR_LONGJMP(&try_cxt->context, LONGJMP_EXCEPTION);
            ASSERT_NOT_REACHED();
        }

        target = compute_memory_target(dcontext, pc, sc, siginfo, &is_write);

#ifdef CLIENT_INTERFACE
        if (!IS_INTERNAL_STRING_OPTION_EMPTY(client_lib) && is_in_client_lib(pc)) {
            /* i#1354: client might write to a page we made read-only.
             * If so, handle the fault and re-execute it, if it's safe to do so
             * (we document these criteria under DR_MEMPROT_PRETEND_WRITE).
             */
            if (is_write && !is_couldbelinking(dcontext) &&
                OWN_NO_LOCKS(dcontext) &&
                check_for_modified_code(dcontext, pc, sc, target, true/*native*/))
                break;
            abort_on_fault(dcontext, DUMPCORE_CLIENT_EXCEPTION, pc, sc,
                           exception_label_client,  (sig == SIGSEGV) ? "SEGV" : "BUS",
                           " client library");
            ASSERT_NOT_REACHED();
        }
#endif

        /* For !HAVE_MEMINFO, we cannot compute the target until
         * after the try/except check b/c compute_memory_target()
         * calls get_memory_info_from_os() which does a probe: and the
         * try/except could be from a probe itself.  A try/except that
         * triggers a stack overflow should recover on the longjmp, so
         * this order should be fine.
         */

#ifdef STACK_GUARD_PAGE
        if (sig == SIGSEGV && is_write && is_stack_overflow(dcontext, target)) {
            SYSLOG_INTERNAL_CRITICAL(PRODUCT_NAME" stack overflow at pc "PFX, pc);
            /* options are already synchronized by the SYSLOG */
            if (TEST(DUMPCORE_INTERNAL_EXCEPTION, dynamo_options.dumpcore_mask))
                os_dump_core("stack overflow");
            os_terminate(dcontext, TERMINATE_PROCESS);
        }
#endif /* STACK_GUARD_PAGE */

        /* FIXME: share code with Windows callback.c */
        /* FIXME PR 205795: in_fcache and is_dynamo_address do grab locks! */
        if ((is_on_dstack(dcontext, (byte *)sc->SC_XSP)
             /* PR 302951: clean call arg processing => pass to app/client.
              * Rather than call the risky in_fcache we check whereami. */
             IF_CLIENT_INTERFACE(&& (dcontext->whereami != WHERE_FCACHE))) ||
            is_on_alt_stack(dcontext, (byte *)sc->SC_XSP) ||
            is_on_initstack((byte *)sc->SC_XSP)) {
            /* Checks here need to cover everything that record_pending_signal()
             * thinks is non-fcache, non-gencode: else that routine will kill
             * process since can't delay or re-execute (i#195/PR 453964).
             */
            is_DR_exception = true;
        } else if (!safe_is_in_fcache(dcontext, pc, (byte*)sc->SC_XSP) &&
                   (in_generated_routine(dcontext, pc) ||
                    is_at_do_syscall(dcontext, pc, (byte*)sc->SC_XSP) ||
                    is_dynamo_address(pc))) {
#ifdef CLIENT_INTERFACE
            if (!in_generated_routine(dcontext, pc) &&
                !is_at_do_syscall(dcontext, pc, (byte*)sc->SC_XSP)) {
                /* PR 451074: client needs a chance to handle exceptions in its
                 * own gencode.  client_exception_event() won't return if client
                 * wants to re-execute faulting instr.
                 */
                dr_signal_action_t action =
                    send_signal_to_client(dcontext, sig, frame, sc,
                                          target, false/*!blocked*/, NULL);
                if (action != DR_SIGNAL_DELIVER && /* for delivery, continue below */
                    !handle_client_action_from_cache(dcontext, sig, action, frame,
                                                     sc, false/*!blocked*/)) {
                    /* client handled fault */
                    break;
                }
            }
#endif
            is_DR_exception = true;
        }
        if (is_DR_exception) {
            /* kill(getpid(), SIGSEGV) looks just like a SIGSEGV in the store of eax
             * to mcontext after the syscall instr in do_syscall -- try to distinguish:
             */
            if (is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, siginfo)) {
                LOG(THREAD, LOG_ALL, 2,
                    "assuming SIGSEGV at post-do-syscall is kill, not our write fault\n");
                syscall_signal = true;
            }
            if (!syscall_signal) {
                if (check_in_last_thread_vm_area(dcontext, target)) {
                    /* See comments in callback.c as well.
                     * FIXME: try to share code
                     */
                    SYSLOG_INTERNAL_WARNING("(decode) exception in last area, "
                                            "DR pc="PFX", app pc="PFX, pc, target);
                    STATS_INC(num_exceptions_decode);
                    if (is_building_trace(dcontext)) {
                        LOG(THREAD, LOG_ASYNCH, 2, "intercept_exception: "
                                                   "squashing old trace\n");
                        trace_abort(dcontext);
                    }
                    /* we do get faults when not building a bb: e.g.,
                     * ret_after_call_check does decoding (case 9396) */
                    if (dcontext->bb_build_info != NULL) {
                        /* must have been building a bb at the time */
                        bb_build_abort(dcontext, true/*clean vm area*/, true/*unlock*/);
                    }
                    /* Since we have no sigreturn we have to restore the mask manually */
                    unblock_all_signals(NULL);
                    /* Let's pass it back to the application - memory is unreadable */
                    if (TEST(DUMPCORE_FORGE_UNREAD_EXEC, DYNAMO_OPTION(dumpcore_mask)))
                        os_dump_core("Warning: Racy app execution (decode unreadable)");
                    os_forge_exception(target, UNREADABLE_MEMORY_EXECUTION_EXCEPTION);
                    ASSERT_NOT_REACHED();
                } else {
                    abort_on_DR_fault(dcontext, pc, sc, (sig == SIGSEGV) ? "SEGV" : "BUS",
                                      in_generated_routine(dcontext, pc) ?
                                      " generated" : "");
                }
            }
        }
        /* if get here, pass the signal to the app */

        ASSERT(pc != 0); /* shouldn't get here */
        if (sig == SIGSEGV && !syscall_signal/*only for in-cache signals*/) {
            /* special case: we expect a seg fault for executable regions
             * that were writable and marked read-only by us.
             */
            if (is_write &&
                check_for_modified_code(dcontext, pc, sc, target, false/*!native*/)) {
                /* it was our signal, so don't pass to app -- return now */
                break;
            }
        }
        /* pass it to the application (or client) */
        LOG(THREAD, LOG_ALL, 1,
            "** Received SIG%s at cache pc "PFX" in thread "TIDFMT"\n",
            (sig == SIGSEGV) ? "SEGV" : "BUS", pc, get_thread_id());
        ASSERT(syscall_signal || safe_is_in_fcache(dcontext, pc, (byte *)sc->SC_XSP));
        /* we do not call trace_abort() here since we may need to
         * translate from a temp private bb (i#376): but all paths
         * that deliver the signal or redirect will call it
         */
        record_pending_signal(dcontext, sig, ucxt, frame, false _IF_CLIENT(target));
        break;
    }

    /* PR 212090: the signal we use to suspend threads */
    case SUSPEND_SIGNAL:
        if (handle_suspend_signal(dcontext, ucxt))
            record_pending_signal(dcontext, sig, ucxt, frame, false _IF_CLIENT(NULL));
        /* else, don't deliver to app */
        break;

    /* i#61/PR 211530: the signal we use for nudges */
    case NUDGESIG_SIGNUM:
        if (handle_nudge_signal(dcontext, siginfo, ucxt))
            record_pending_signal(dcontext, sig, ucxt, frame, false _IF_CLIENT(NULL));
        /* else, don't deliver to app */
        break;

    case SIGALRM:
    case SIGVTALRM:
    case SIGPROF:
        if (handle_alarm(dcontext, sig, ucxt))
            record_pending_signal(dcontext, sig, ucxt, frame, false _IF_CLIENT(NULL));
        /* else, don't deliver to app */
        break;

#ifdef SIDELINE
    case SIGCHLD: {
        int status = siginfo->si_status;
        if (siginfo->si_pid == 0) {
            /* FIXME: with older versions of linux the sigchld fields of
             * siginfo are not filled in properly!
             * This is my attempt to handle that, pid seems to be 0
             */
            break;
        }
        if (status != 0) {
            LOG(THREAD, LOG_ALL, 0, "*** Child thread died with error %d\n",
                status);
            ASSERT_NOT_REACHED();
        }
        break;
    }
#endif

    default: {
        record_pending_signal(dcontext, sig, ucxt, frame, false _IF_CLIENT(NULL));
        break;
    }
    } /* end switch */

    LOG(THREAD, LOG_ASYNCH, level, "\tmaster_signal_handler %d returning now\n\n", sig);

    /* restore protections */
    if (local)
        SELF_PROTECT_LOCAL(dcontext, READONLY);
    EXITING_DR();
}

/* Sets up execution of the app's handler for signal "sig" using DR's own
 * signal frame "our_frame": the frame is copied to the stack location
 * computed by get_sigstack_frame_ptr(), the handler's mask is added to the
 * blocked set, and the sigcontext inside our_frame is redirected (via
 * transfer_from_sig_handler_to_fcache_return()) so that the handler is the
 * next app code executed.
 *
 * sc_orig is handed to the client (CLIENT_INTERFACE builds) or to
 * execute_default_from_cache() (other builds); f and access_address are
 * only forwarded to send_signal_to_client().
 *
 * Returns true once the handler has been set up.  Returns false when the
 * handler will not be run: in CLIENT_INTERFACE builds when
 * handle_client_action_from_cache() returns false, and otherwise when the
 * app's action is SIG_DFL.
 */
static bool
execute_handler_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame,
                           sigcontext_t *sc_orig, fragment_t *f
                           _IF_CLIENT(byte *access_address))
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    /* we want to modify the sc in DR's frame */
    sigcontext_t *sc = get_sigcontext_from_rt_frame(our_frame);
    kernel_sigset_t blocked;
    /* Entry point of the app handler.  Read exactly once, before the
     * SA_ONESHOT reset below, so that the final LOG reports the handler we
     * actually transferred to rather than the post-reset value.
     */
    app_pc handler_pc;
    /* Need to get xsp now before get new dcontext.
     * This is the translated xsp, so we avoid PR 306410 (cleancall arg fault
     * on dstack => handler run on dstack) that Windows hit.
     */
    byte *xsp = get_sigstack_frame_ptr(dcontext, sig,
                                       our_frame/* take xsp from (translated)
                                                 * interruption point */);

#ifdef CLIENT_INTERFACE
    dr_signal_action_t action =
        send_signal_to_client(dcontext, sig, our_frame, sc_orig, access_address,
                              false/*not blocked*/, f);
    if (!handle_client_action_from_cache(dcontext, sig, action, our_frame, sc_orig,
                                         false/*!blocked*/))
        return false;
#else
    if (info->app_sigaction[sig] == NULL ||
        info->app_sigaction[sig]->handler == (handler_t)SIG_DFL) {
        LOG(THREAD, LOG_ASYNCH, 3, "\taction is SIG_DFL\n");
        if (execute_default_from_cache(dcontext, sig, our_frame, sc_orig)) {
            /* if we haven't terminated, restore original (untranslated) sc
             * on request.
             */
            *get_sigcontext_from_rt_frame(our_frame) = *sc_orig;
        }
        return false;
    }
    ASSERT(info->app_sigaction[sig] != NULL &&
           info->app_sigaction[sig]->handler != (handler_t)SIG_IGN &&
           info->app_sigaction[sig]->handler != (handler_t)SIG_DFL);
#endif

    LOG(THREAD, LOG_ASYNCH, 2, "execute_handler_from_cache for signal %d\n", sig);
    RSTATS_INC(num_signals);

    /* now that we know it's not a client-involved fault, dump as app fault */
    report_app_problem(dcontext, APPFAULT_FAULT, (byte *)sc->SC_XIP, (byte *)sc->SC_FP,
                       "\nSignal %d delivered to application handler.\n", sig);

    LOG(THREAD, LOG_ASYNCH, 3, "\txsp is "PFX"\n", xsp);

    /* copy frame to appropriate stack and convert to non-rt if necessary */
    copy_frame_to_stack(dcontext, sig, our_frame, (void *)xsp, false/*!pending*/);
    LOG(THREAD, LOG_ASYNCH, 3, "\tcopied frame from "PFX" to "PFX"\n", our_frame, xsp);

    /* Because of difficulties determining when/if a signal handler
     * returns, we do what the kernel does: abandon all of our current
     * state, copy what we might need to the handler frame if we come back,
     * and then it's ok if the handler doesn't return.
     * If it does, we start interpreting afresh when we see sigreturn().
     * This routine assumes anything needed to return has been put in the
     * frame (only needed for signals queued up while in dynamo), and goes
     * ahead and trashes the current dcontext.
     */

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* add to set of blocked signals those in sigaction mask */
    blocked = info->app_sigaction[sig]->mask;
    /* SA_NOMASK says whether to block sig itself or not */
    if ((info->app_sigaction[sig]->flags & SA_NOMASK) == 0)
        kernel_sigaddset(&blocked, sig);
    set_blocked(dcontext, &blocked, false/*relative: OR these in*/);

    /* Set our sigreturn context (NOT for the app: we already copied the
     * translated context to the app stack) to point to fcache_return!
     * Then we'll go back through kernel, appear in fcache_return,
     * and go through dispatch & interp, without messing up DR stack.
     */
    handler_pc = (app_pc) SIGACT_PRIMARY_HANDLER(info->app_sigaction[sig]);
    transfer_from_sig_handler_to_fcache_return
        (dcontext, sc,
         /* Make sure handler is next thing we execute */
         handler_pc,
         (linkstub_t *) get_sigreturn_linkstub());
    /* Doesn't matter what most app registers are, signal handler doesn't
     * expect anything except the frame on the stack.  We do need to set xsp,
     * only because if app wants special signal stack we need to point xsp
     * there.  (If no special signal stack, this is a nop.)
     */
    sc->SC_XSP = (ptr_uint_t) xsp;
#ifdef X64
    /* Set up args to handler: int sig, siginfo_t *siginfo, kernel_ucontext_t *ucxt */
    sc->SC_XDI = sig;
    sc->SC_XSI = (reg_t) &((sigframe_rt_t *)xsp)->info;
    sc->SC_XDX = (reg_t) &((sigframe_rt_t *)xsp)->uc;
#endif

    if ((info->app_sigaction[sig]->flags & SA_ONESHOT) != 0) {
        /* clear handler now -- can't delete memory since sigreturn,
         * others may look at sigaction struct, so we just set to default
         */
        info->app_sigaction[sig]->handler = (handler_t) SIG_DFL;
    }

    /* Log the captured handler: re-reading it from the sigaction here could
     * yield SIG_DFL if the SA_ONESHOT reset above just ran.
     */
    LOG(THREAD, LOG_ASYNCH, 3, "\tset next_tag to handler "PFX", xsp to "PFX"\n",
        handler_pc, xsp);
    return true;
}

/* Sets up execution of the app's handler for the pending signal "sig",
 * working from the frame saved in info->sigpending[sig] and the thread's
 * current mcontext (presumably invoked from dispatch, per the name, after
 * the signal was recorded as pending).
 *
 * The saved frame's sigcontext is first updated to describe the current
 * app state, the client (CLIENT_INTERFACE builds) is given a chance to
 * act on it, and then the frame is copied to the handler's stack location
 * and the mcontext/next_tag are pointed at the handler.
 *
 * Returns false only when the signal is not delivered at all (client
 * returned DR_SIGNAL_SUPPRESS, or the app's handler is SIG_IGN).  Returns
 * true when the handler was set up, when the default action was executed,
 * or when the client redirected execution.
 */
static bool
execute_handler_from_dispatch(dcontext_t *dcontext, int sig)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    /* destination for the frame, computed up front */
    byte *xsp = get_sigstack_frame_ptr(dcontext, sig, NULL);
    /* the frame that was stored when this signal was recorded as pending */
    sigframe_rt_t *frame = &(info->sigpending[sig]->rt_frame);
    priv_mcontext_t *mcontext = get_mcontext(dcontext);
    sigcontext_t *sc;
    kernel_sigset_t blocked;

#ifdef CLIENT_INTERFACE
    dr_signal_action_t action;
#else
    /* Without a client there is nobody else to consult: a missing or
     * default handler goes straight to the default action.
     */
    if (info->app_sigaction[sig] == NULL ||
        info->app_sigaction[sig]->handler == (handler_t)SIG_DFL) {
        LOG(THREAD, LOG_ASYNCH, 3, "\taction is SIG_DFL\n");
        execute_default_from_dispatch(dcontext, sig, frame);
        return true;
    }
    ASSERT(info->app_sigaction[sig] != NULL &&
           info->app_sigaction[sig]->handler != (handler_t)SIG_IGN &&
           info->app_sigaction[sig]->handler != (handler_t)SIG_DFL);
#endif

    LOG(THREAD, LOG_ASYNCH, 2, "execute_handler_from_dispatch for signal %d\n", sig);
    RSTATS_INC(num_signals);

    /* modify the rtframe before copying to stack so we can pass final
     * version to client, and propagate its mods
     */
    sc = get_sigcontext_from_rt_frame(frame);

    /* Because of difficulties determining when/if a signal handler
     * returns, we do what the kernel does: abandon all of our current
     * state, copy what we might need to the handler frame if we come back,
     * and then it's ok if the handler doesn't return.
     * If it does, we start interpreting afresh when we see sigreturn().
     */

#ifdef DEBUG
    if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "original sigcontext "PFX":\n", sc);
        dump_sigcontext(dcontext, sc);
    }
#endif
    if (info->sigpending[sig]->use_sigcontext) {
        /* the stored sigcontext is kept as-is rather than being overwritten
         * from the mcontext
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "%s: using sigcontext, not mcontext (syscall restart)\n", __FUNCTION__);
    } else {
        /* copy currently-interrupted-context to frame's context, so we can
         * abandon the currently-interrupted context.
         */
        mcontext_to_sigcontext(sc, mcontext);
    }
    /* mcontext does not contain fp or mmx or xmm state, which may have
     * changed since the frame was created (while finishing up interrupted
     * fragment prior to returning to dispatch).  Since DR does not touch
     * this state except for xmm on x64, we go ahead and copy the
     * current state into the frame, and then touch up xmm for x64.
     */
    /* FIXME: should this be done for all pending as soon as reach
     * dispatch?  what if get two asynch inside same frag prior to exiting
     * cache?  have issues with fpstate, but also prob with next_tag? FIXME
     */
    /* FIXME: we should clear fpstate for app handler itself as that's
     * how our own handler is executed.
     */
#if defined(LINUX) && defined(X86)
    ASSERT(sc->fpstate != NULL); /* not doing i#641 yet */
    save_fpstate(dcontext, frame);
#endif /* LINUX && X86 */
#ifdef DEBUG
    if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "new sigcontext "PFX":\n", sc);
        dump_sigcontext(dcontext, sc);
        LOG(THREAD, LOG_ASYNCH, 3, "\n");
    }
#endif
    /* FIXME: other state?  debug regs?
     * if no syscall allowed between master_ (when frame created) and
     * receiving, then don't have to worry about debug regs, etc.
     * check for syscall when record pending, if it exists, try to
     * receive in pre_system_call or something? what if ignorable?  FIXME!
     */

    if (!info->sigpending[sig]->use_sigcontext) {
        /* for the pc we want the app pc not the cache pc */
        sc->SC_XIP = (ptr_uint_t) dcontext->next_tag;
        LOG(THREAD, LOG_ASYNCH, 3, "\tset frame's eip to "PFX"\n", sc->SC_XIP);
    }

#ifdef CLIENT_INTERFACE
    action = send_signal_to_client(dcontext, sig, frame, NULL,
                                   info->sigpending[sig]->access_address,
                                   false/*not blocked*/, NULL);
    /* in order to pass to the client, we come all the way here for signals
     * the app has no handler for
     */
    if (action == DR_SIGNAL_REDIRECT) {
        /* send_signal_to_client copied mcontext into frame's sc */
        sigcontext_to_mcontext(get_mcontext(dcontext), sc);
        dcontext->next_tag = (app_pc) sc->SC_XIP;
        if (is_building_trace(dcontext)) {
            LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
            trace_abort(dcontext);
        }
        return true; /* don't try another signal */
    }
    else if (action == DR_SIGNAL_SUPPRESS ||
        (info->app_sigaction[sig] != NULL &&
         info->app_sigaction[sig]->handler == (handler_t)SIG_IGN)) {
        /* the only path that reports "not delivered" to the caller */
        LOG(THREAD, LOG_ASYNCH, 2, "%s: not delivering!\n",
            (action == DR_SIGNAL_SUPPRESS) ?
            "client suppressing signal" :
            "app signal handler is SIG_IGN");
        return false;
    }
    else if (action == DR_SIGNAL_BYPASS ||
        (info->app_sigaction[sig] == NULL ||
         info->app_sigaction[sig]->handler == (handler_t)SIG_DFL)) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: executing default action\n",
            (action == DR_SIGNAL_BYPASS) ?
            "client forcing default" :
            "app signal handler is SIG_DFL");
        if (info->sigpending[sig]->use_sigcontext) {
            /* after the default action we want to go to the sigcontext */
            dcontext->next_tag = (app_pc) sc->SC_XIP;
            sigcontext_to_mcontext(get_mcontext(dcontext), sc);
        }
        execute_default_from_dispatch(dcontext, sig, frame);
        return true;
    }
    CLIENT_ASSERT(action == DR_SIGNAL_DELIVER, "invalid signal event return value");
#endif

    /* now that we've made all our changes and given the client a
     * chance to make changes, copy the frame to the appropriate stack
     * location and convert to non-rt if necessary
     */
    copy_frame_to_stack(dcontext, sig, frame, xsp, true/*pending*/);
    /* now point at the app's frame */
    sc = get_sigcontext_from_app_frame(info, sig, (void *) xsp);

    ASSERT(info->app_sigaction[sig] != NULL);

    /* add to set of blocked signals those in sigaction mask */
    blocked = info->app_sigaction[sig]->mask;
    /* SA_NOMASK says whether to block sig itself or not */
    if ((info->app_sigaction[sig]->flags & SA_NOMASK) == 0)
        kernel_sigaddset(&blocked, sig);
    set_blocked(dcontext, &blocked, false/*relative: OR these in*/);

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* Doesn't matter what most app registers are, signal handler doesn't
     * expect anything except the frame on the stack.  We do need to set xsp.
     */
    mcontext->xsp = (ptr_uint_t) xsp;
#ifdef X64
    /* Set up args to handler: int sig, siginfo_t *siginfo, kernel_ucontext_t *ucxt */
    mcontext->xdi = sig;
    mcontext->xsi = (reg_t) &((sigframe_rt_t *)xsp)->info;
    mcontext->xdx = (reg_t) &((sigframe_rt_t *)xsp)->uc;
#endif
#ifdef X86
    /* Clear eflags DF (signal handler should match function entry ABI) */
    mcontext->xflags &= ~EFLAGS_DF;
#endif
    /* Make sure handler is next thing we execute */
    dcontext->next_tag = (app_pc) SIGACT_PRIMARY_HANDLER(info->app_sigaction[sig]);

    /* The handler was already read into next_tag above, so resetting it
     * here does not affect this delivery.
     */
    if ((info->app_sigaction[sig]->flags & SA_ONESHOT) != 0) {
        /* clear handler now -- can't delete memory since sigreturn,
         * others may look at sigaction struct, so we just set to default
         */
        info->app_sigaction[sig]->handler = (handler_t) SIG_DFL;
    }

    LOG(THREAD, LOG_ASYNCH, 3, "\tset xsp to "PFX"\n", xsp);
    return true;
}

/* Terminates by having cleanup_and_terminate() issue SYS_kill on this
 * process.  Does not return (see the trailing ASSERT_NOT_REACHED()).
 *
 * The arg to SYS_kill, i.e., the signal number, should be in dcontext->sys_param0.
 * dcontext must be the calling thread's own dcontext.
 */
static void
terminate_via_kill(dcontext_t *dcontext)
{
    ASSERT(dcontext == get_thread_private_dcontext());

    /* FIXME PR 541760: there can be multiple thread groups and thus
     * this may not exit all threads in the address space
     */
    cleanup_and_terminate(dcontext, SYS_kill,
                          /* Pass -pid in case main thread has exited
                           * in which case will get -ESRCH
                           */
                          /* On VMX86 builds IF_VMX86 prepends a conditional that
                           * selects the negated pid inside a vmkernel userworld;
                           * otherwise the plain pid is passed.
                           */
                          IF_VMX86(os_in_vmkernel_userworld() ?
                                   -(int)get_process_id() :)
                          get_process_id(),
                          dcontext->sys_param0, true, 0, 0);
    ASSERT_NOT_REACHED();
}

/* Returns whether the current stack pointer lies inside this thread's
 * info->sigstack region, i.e. in [ss_sp, ss_sp + ss_size).
 */
bool
is_currently_on_sigaltstack(dcontext_t *dcontext)
{
    thread_sig_info_t *sig_info = (thread_sig_info_t *) dcontext->signal_field;
    byte *sp_now;

    GET_STACK_PTR(sp_now);
    /* below the base => not on the sigstack */
    if (sp_now < (byte *)sig_info->sigstack.ss_sp)
        return false;
    /* otherwise on it only if strictly below the end of the region */
    return sp_now < (byte *)sig_info->sigstack.ss_sp + sig_info->sigstack.ss_size;
}

/* Terminates via SYS_kill with signal "sig" regardless of which stack we
 * are currently executing on.  Does not return.
 */
static void
terminate_via_kill_from_anywhere(dcontext_t *dcontext, int sig)
{
    /* terminate_via_kill() reads the signal number from here */
    dcontext->sys_param0 = sig;
    if (!is_currently_on_sigaltstack(dcontext)) {
        /* safe to terminate right where we are */
        terminate_via_kill(dcontext);
    } else {
        /* We can't clean up our sigstack properly when we're on it
         * (i#1160) so we terminate on the dstack.
         */
        call_switch_stack(dcontext, dcontext->dstack, terminate_via_kill,
                          NULL/*!initstack */, false/*no return */);
    }
    ASSERT_NOT_REACHED();
}

/* Terminates the process by sending it signal "sig".  Does not return.
 * With TERMINATE_CLEANUP set in flags, DR is cleaned up first via
 * terminate_via_kill_from_anywhere(); otherwise the signal is sent
 * directly and we wait for it to take effect.
 * xref os_request_fatal_coredump()
 */
void
os_terminate_via_signal(dcontext_t *dcontext, terminate_flags_t flags, int sig)
{
    /* The default action is installed for every signal except SIGKILL and
     * SIGSTOP, which are skipped here.
     */
    if (!(sig == SIGKILL || sig == SIGSTOP)) {
        DEBUG_DECLARE(bool reset_ok =)
            set_default_signal_action(sig);
        ASSERT(reset_ok);
    }
    if (!TEST(TERMINATE_CLEANUP, flags)) {
        /* general clean up is unsafe: just remove .1config file */
        config_exit();
        dynamorio_syscall(SYS_kill, 2, get_process_id(), sig);
        /* We try both the SYS_kill and the immediate crash since on some platforms
         * the SIGKILL is delayed and on others the *-1 is hanging(?): should investigate
         */
        if (sig == SIGSEGV) /* make doubly-sure */
            *((int *)PTR_UINT_MINUS_1) = 0;
        /* in case signal delivery is delayed we wait...forever */
        for (;;) {
            os_thread_yield();
        }
    } else {
        /* we enter from several different places, so rewind until top-level kstat */
        KSTOP_REWIND_UNTIL(thread_measured);
        ASSERT(dcontext != NULL);
        dcontext->sys_param0 = sig;
        /* XXX: the comment in the no-cleanup branch above implies some systems
         * have SYS_kill of SIGSEGV w/ no handler on oneself actually return.
         * cleanup_and_terminate won't return to us and will use global_do_syscall
         * to invoke SYS_kill, which in debug will do an inf loop (good!) but
         * in release will do SYS_exit_group -- oh well, the systems I'm testing
         * on do an immediate exit.
         */
        terminate_via_kill_from_anywhere(dcontext, sig);
    }
    ASSERT_NOT_REACHED();
}

/* Carries out the default action for signal "sig" on behalf of the app.
 *
 * frame is the signal frame whose sigcontext supplies the interruption pc.
 * sc_orig is the original sigcontext; it is only read on the
 * !from_dispatch re-execute path, where it must be non-NULL.
 * from_dispatch says whether we were called via
 * execute_default_from_dispatch() (true) or execute_default_from_cache()
 * (false).
 *
 * For signals whose default action is to terminate, this either kills the
 * process via SYS_kill (and does not return), or, when the faulting cache
 * instruction matches the app instruction, cleans up DR and returns false
 * so that re-executing the instruction raises the signal in the app
 * context.  In all other cases that return, the return value is true.
 */
static bool
execute_default_action(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                       sigcontext_t *sc_orig, bool from_dispatch)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    byte *pc = (byte *) sc->SC_XIP;

    LOG(THREAD, LOG_ASYNCH, 3, "execute_default_action for signal %d\n", sig);

    /* should only come here for signals we catch, or signal with ONESHOT
     * that didn't sigreturn
     */
    /* app_sigaction[sig] may be NULL (the check just below allows for it),
     * so guard the dereference: a violation should report an assert
     * failure rather than fault inside the assert itself.
     */
    ASSERT(info->we_intercept[sig] ||
           (info->app_sigaction[sig] != NULL &&
            (info->app_sigaction[sig]->flags & SA_ONESHOT) != 0));

    if (info->app_sigaction[sig] != NULL &&
        (info->app_sigaction[sig]->flags & SA_ONESHOT) != 0) {
        if (!info->we_intercept[sig]) {
            handler_free(dcontext, info->app_sigaction[sig], sizeof(kernel_sigaction_t));
            info->app_sigaction[sig] = NULL;
        }
    }

    /* FIXME PR 205310: we can't always perfectly emulate the default
     * behavior.  To execute the default action, we have to un-register our
     * handler, if we have one, for signals whose default action is not
     * ignore or that will just be re-raised upon returning to the
     * interrupted context -- FIXME: are any of the ignores repeated?
     * SIGURG?
     *
     * If called from execute_handler_from_cache(), our master_signal_handler()
     * is going to return directly to the translated context: which means we
     * go native to re-execute the instr, which if it does in fact generate
     * the signal again means we have a nice transparent core dump.
     *
     * If called from execute_handler_from_dispatch(), we need to generate
     * the signal ourselves.
     */
    if (default_action[sig] != DEFAULT_IGNORE) {
        DEBUG_DECLARE(bool ok =)
            set_default_signal_action(sig);
        ASSERT(ok);

        /* FIXME: to avoid races w/ shared handlers should set a flag to
         * prevent another thread from re-enabling.
         * Perhaps worse: what if this signal arrives for another thread
         * in the meantime (and the default is not terminate)?
         */
        if (info->shared_app_sigaction) {
            LOG(THREAD, LOG_ASYNCH, 1,
                "WARNING: having to install SIG_DFL for thread "TIDFMT", but will be shared!\n",
                get_thread_id());
        }
        if (default_action[sig] == DEFAULT_TERMINATE ||
            default_action[sig] == DEFAULT_TERMINATE_CORE) {
            report_app_problem(dcontext, APPFAULT_CRASH, pc, (byte *)sc->SC_FP,
                               "\nSignal %d delivered to application as default action.\n",
                               sig);
            /* N.B.: we don't have to restore our handler because the
             * default action is for the process (entire thread group for NPTL) to die!
             */
            if (from_dispatch ||
                can_always_delay[sig] ||
                is_sys_kill(dcontext, pc, (byte*)sc->SC_XSP, &frame->info)) {
                /* This must have come from SYS_kill rather than raised by
                 * a faulting instruction.  Thus we can't go re-execute the
                 * instr in order to re-raise the signal (if from_dispatch,
                 * we delayed and can't re-execute anyway).  Instead we
                 * re-generate via SYS_kill.  An alternative, if we don't
                 * care about generating a core dump, is to use SYS_exit
                 * and pass the right exit code to indicate the signal
                 * number: that would avoid races w/ the sigaction.
                 *
                 * FIXME: should have app make the syscall to get a more
                 * transparent core dump!
                 */
                if (!from_dispatch)
                    KSTOP_NOT_MATCHING_NOT_PROPAGATED(fcache_default);
                KSTOP_NOT_MATCHING_NOT_PROPAGATED(dispatch_num_exits);
                if (is_couldbelinking(dcontext)) /* won't be for SYS_kill (i#1159) */
                    enter_nolinking(dcontext, NULL, false);
                /* we could be on sigstack so call this version: */
                terminate_via_kill_from_anywhere(dcontext, sig);
                ASSERT_NOT_REACHED();
            } else {
                /* We assume that re-executing the interrupted instr will
                 * re-raise the fault.  We could easily be wrong:
                 * xref PR 363811 infinite loop due to memory we
                 * thought was unreadable and thus thought would raise
                 * a signal; xref PR 368277 to improve is_sys_kill().
                 * FIXME PR 205310: we should check whether we come out of
                 * the cache when we expected to terminate!
                 *
                 * An alternative is to abandon transparent core dumps and
                 * do the same explicit SYS_kill we do for from_dispatch.
                 * That would let us clean up DR as well.
                 * FIXME: currently we do not clean up DR for a synchronous
                 * signal death, but we do for asynch.
                 */
                /* i#552: cleanup and raise client exit event */
                int   instr_sz;
                /* We are on the sigstack now, so assign it to NULL to avoid being
                 * freed during process exit cleanup.  The function-level "info"
                 * already points at dcontext->signal_field, so it is used
                 * directly instead of a second, shadowing local.
                 */
                info->sigstack.ss_sp = NULL;
                /* We enter from several different places, so rewind until
                 * top-level kstat.
                 */
                KSTOP_REWIND_UNTIL(thread_measured);
                /* We try to raise the same signal in app's context so a correct
                 * coredump can be generated. However, the client might change
                 * the code in a way that the corresponding app code won't
                 * raise the signal, so we first check if the app instr is the
                 * same as instr in the cache, and raise the signal (by return).
                 * Otherwise, we kill the process instead.
                 */
                ASSERT(sc_orig != NULL);
                instr_sz = decode_sizeof(dcontext, (byte *) sc_orig->SC_XIP,
                                         NULL _IF_X64(NULL));
                if (instr_sz != 0 &&
                    instr_sz == decode_sizeof(dcontext, pc, NULL _IF_X64(NULL)) &&
                    memcmp(pc, (byte *) sc_orig->SC_XIP, instr_sz) == 0) {
                    /* the app instr matches the cache instr; cleanup and raise
                     * the signal in the app context
                     */
                    dynamo_process_exit();
                    /* we cannot re-enter the cache, which is freed by now */
                    ASSERT(!from_dispatch);
                    return false;
                } else {
                    /* mismatch, cleanup and terminate */
                    dcontext->sys_param0 = sig;
                    terminate_via_kill(dcontext);
                    ASSERT_NOT_REACHED();
                }
            }
        } else {
            /* FIXME PR 297033: in order to intercept DEFAULT_STOP /
             * DEFAULT_CONTINUE signals, we need to set sigcontext to point
             * to some kind of regain-control routine, so that when our
             * thread gets to run again we can reset our handler.  So far
             * we have no signals that fall here that we intercept.
             */
            CLIENT_ASSERT(false, "STOP/CONT signals not supported");
        }
#if defined(DEBUG) && defined(INTERNAL)
        if (sig == SIGSEGV && !dynamo_exited) {
            /* pc should be an app pc at this point (it was translated) --
             * check for bad cases here
             */
            if (safe_is_in_fcache(dcontext, pc, (byte *)sc->SC_XSP)) {
                fragment_t wrapper;
                fragment_t *f;
                LOG(THREAD, LOG_ALL, 1,
                    "Received SIGSEGV at pc "PFX" in thread "TIDFMT"\n", pc, get_thread_id());
                f = fragment_pclookup(dcontext, pc, &wrapper);
                if (f)
                    disassemble_fragment(dcontext, f, false);
                ASSERT_NOT_REACHED();
            } else if (in_generated_routine(dcontext, pc)) {
                LOG(THREAD, LOG_ALL, 1,
                    "Received SIGSEGV at generated non-code-cache pc "PFX"\n", pc);
                ASSERT_NOT_REACHED();
            }
        }
#endif
    }

    /* now continue at the interruption point and re-raise the signal */
    return true;
}

/* Wrapper: runs execute_default_action() with from_dispatch == false and
 * passes its result straight back to the caller.
 */
static bool
execute_default_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                           sigcontext_t *sc_orig)
{
    bool resume;
    resume = execute_default_action(dcontext, sig, frame, sc_orig,
                                    false/*!from_dispatch*/);
    return resume;
}

/* Wrapper around execute_default_action() that supplies no original
 * sigcontext (NULL) and passes true for the final flag.  The return value
 * of execute_default_action() is deliberately ignored.
 */
static void
execute_default_from_dispatch(dcontext_t *dcontext, int sig, sigframe_rt_t *frame)
{
    (void) execute_default_action(dcontext, sig, frame, NULL, true);
}

/* Delivers queued app signals for this thread.
 *
 * First, if a fragment was interrupted (info->interrupted), its outgoing
 * links are restored and, for a fragment containing a syscall, the syscall
 * code is re-mangled; the interrupted record is then cleared.
 *
 * Then the per-signal pending lists are scanned in increasing signal number.
 * For each pending entry that is deliverable, execute_handler_from_dispatch()
 * is invoked and the entry is removed from the list and freed.  The scan stops
 * as soon as a handler is set up to execute ("only one signal at a time").
 *
 * dcontext->signals_pending is cleared only when the scan runs through every
 * signal number without stopping for a handler.
 */
void
receive_pending_signal(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    sigpending_t *temp;
    int sig;
    LOG(THREAD, LOG_ASYNCH, 3, "receive_pending_signal\n");
    if (info->interrupted != NULL) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tre-linking outgoing for interrupted F%d\n",
            info->interrupted->id);
        SHARED_FLAGS_RECURSIVE_LOCK(info->interrupted->flags, acquire,
                                    change_linking_lock);
        link_fragment_outgoing(dcontext, info->interrupted, false);
        SHARED_FLAGS_RECURSIVE_LOCK(info->interrupted->flags, release,
                                    change_linking_lock);
        if (TEST(FRAG_HAS_SYSCALL, info->interrupted->flags)) {
            /* restore syscall (they're a barrier to signals, so signal
             * handler has cur frag exit before it does a syscall)
             */
            ASSERT(info->interrupted_pc != NULL);
            mangle_syscall_code(dcontext, info->interrupted,
                                info->interrupted_pc, true/*skip exit cti*/);
        }
        info->interrupted = NULL;
        info->interrupted_pc = NULL;
    }
    /* grab first pending signal
     * XXX: start with real-time ones?
     */
    /* "lock" the array to prevent a new signal that interrupts this bit of
     * code from prepending or deleting from the array while we're accessing it
     */
    info->accessing_sigpending = true;
    /* barrier to prevent compiler from moving the above write below the loop */
    __asm__ __volatile__("" : : : "memory");
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sigpending[sig] != NULL) {
            bool executing = true;
            /* We do not re-check whether blocked if it was unblocked at
             * receive time, to properly handle sigsuspend (i#1340).
             * Otherwise, a signal that is currently in the app's blocked
             * set must stay queued: skip it and look at the next signal.
             */
            if (!info->sigpending[sig]->unblocked &&
                kernel_sigismember(&info->app_sigblocked, sig)) {
                LOG(THREAD, LOG_ASYNCH, 3, "\tsignal %d is blocked!\n", sig);
                continue;
            }
            LOG(THREAD, LOG_ASYNCH, 3, "\treceiving signal %d\n", sig);
            executing = execute_handler_from_dispatch(dcontext, sig);
            /* pop the entry we just processed off the head of this signal's list */
            temp = info->sigpending[sig];
            info->sigpending[sig] = temp->next;
            special_heap_free(info->sigheap, temp);

            /* only one signal at a time! */
            if (executing)
                break;
        }
    }
    /* barrier to prevent compiler from moving the below write above the loop */
    __asm__ __volatile__("" : : : "memory");
    info->accessing_sigpending = false;

    /* we only clear this on a call to us where we find NO pending signals */
    if (sig > MAX_SIGNUM) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tclearing signals_pending flag\n");
        dcontext->signals_pending = false;
    }
}

/* Pre-syscall handling of an app sigreturn / rt_sigreturn.
 *
 * Locates the signal frame relative to the app stack pointer stored in the
 * mcontext, determines which signal is being returned from, restores the
 * app's blocked-signal set from the mask saved in the frame, and checks for
 * pending signals that may now be deliverable.  The app's resumption pc from
 * the frame's sigcontext is saved in dcontext->asynch_target.  In the
 * non-VMX86_SERVER build the sigcontext is then redirected so that it
 * resumes at fcache_return_routine() with the sigreturn linkstub in the
 * xax/r0 slot (the app's original value of that register is stashed in
 * dcontext->next_tag).  In the VMX86_SERVER build the sigcontext is instead
 * copied into the mcontext.
 *
 * On LINUX, rt selects between the rt frame layout and the plain frame
 * layout; on MACOS the frame is always treated as rt and the ucontext comes
 * from ucxt_param.
 *
 * Returns false if should NOT issue syscall.
 */
bool
#ifdef LINUX
handle_sigreturn(dcontext_t *dcontext, bool rt)
#else
handle_sigreturn(dcontext_t *dcontext, void *ucxt_param, int style)
#endif
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    sigcontext_t *sc = NULL; /* initialize to satisfy Mac clang */
    int sig = 0;
    app_pc next_pc;
    /* xsp was put in mcontext prior to pre_system_call() */
    reg_t xsp = get_mcontext(dcontext)->xsp;
#ifdef MACOS
    bool rt = true;
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "%ssigreturn()\n", rt?"rt_":"");
    LOG(THREAD, LOG_ASYNCH, 3, "\txsp is "PFX"\n", xsp);

#ifdef PROGRAM_SHEPHERDING
    /* if (!sig_has_restorer, region was never added to exec list,
     * allowed as pattern only and kicked off at first write via
     * selfmod detection or otherwise if vsyscall, so no worries
     * about having to remove it here
     */
#endif

    /* get sigframe: it's the top thing on the stack, except the ret
     * popped off pretcode.
     * WARNING: handler for tcsh's window_change (SIGWINCH) clobbers its
     * signal # arg, so don't use frame->sig!  (kernel doesn't look at sig
     * so app can get away with it)
     */
    if (rt) {
        kernel_ucontext_t *ucxt;
#ifdef LINUX
        /* back up over the already-popped pretcode slot to the frame start */
        sigframe_rt_t *frame = (sigframe_rt_t *) (xsp - sizeof(char*));
        /* use si_signo instead of sig, less likely to be clobbered by app */
        sig = frame->info.si_signo;
# ifndef X64
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d (did == param %d)\n",
            sig, frame->sig);
        if (frame->sig != sig)
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: app sig handler clobbered sig param\n");
# endif
        sc = get_sigcontext_from_app_frame(info, sig, (void *) frame);
        ucxt = &frame->uc;
#elif defined(MACOS)
        /* The initial frame fields on the stack are messed up due to
         * params to handler from tramp, so use params to syscall.
         * XXX: we don't have signal # though: so we have to rely on app
         * not clobbering the param field.
         */
        sig = *(int*)xsp;
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d\n", sig);
        ucxt = (kernel_ucontext_t *) ucxt_param;
        sc = SIGCXT_FROM_UCXT(ucxt);
#endif
        ASSERT(sig > 0 && sig <= MAX_SIGNUM && IS_RT_FOR_APP(info, sig));

        /* FIXME: what if handler called sigaction and requested rt
         * when itself was non-rt?
         */

        /* discard blocked signals, re-set from prev mask stored in frame */
        set_blocked(dcontext, SIGMASK_FROM_UCXT(ucxt), true/*absolute*/);
    }
#ifdef LINUX
    else {
        /* FIXME: libc's restorer pops prior to calling sigreturn, I have
         * no idea why, but kernel asks for xsp-8 not xsp-4...weird!
         */
        kernel_sigset_t prevset;
        sigframe_plain_t *frame = (sigframe_plain_t *) (xsp-8);
        /* We don't trust frame->sig (app sometimes clobbers it), and for
         * plain frame there's no other place that sig is stored,
         * so as a hack we added a new frame!
         * FIXME: this means we won't support nonstandard use of SYS_sigreturn,
         * e.g., as NtContinue, if frame didn't come from a real signal and so
         * wasn't copied to stack by us.
         */
        sig = frame->sig_noclobber;
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d (did == param %d)\n",
            sig, frame->sig);
        if (frame->sig != sig)
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: app sig handler clobbered sig param\n");
        ASSERT(sig > 0 && sig <= MAX_SIGNUM && !IS_RT_FOR_APP(info, sig));
        sc = get_sigcontext_from_app_frame(info, sig, (void *) frame);
        /* discard blocked signals, re-set from prev mask stored in frame */
        /* the plain frame splits the saved mask: first word lives in the
         * sigcontext's oldmask, any remaining words in extramask
         */
        prevset.sig[0] = frame->sc.oldmask;
        if (_NSIG_WORDS > 1)
            memcpy(&prevset.sig[1], &frame->extramask, sizeof(frame->extramask));
        set_blocked(dcontext, &prevset, true/*absolute*/);
    }
#endif

    /* Make sure we deliver pending signals that are now unblocked.
     */
    check_signals_pending(dcontext, info);

    /* We abandoned the previous context, so we need to start
     * interpreting anew.  Regardless of whether we handled the signal
     * from dispatch or the fcache, we want to go to the context
     * stored in the frame.  So we have the kernel send us to
     * fcache_return and set up for dispatch to use the frame's
     * context.
     */

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* For a one-shot action the app's handler is expected to already read
     * SIG_DFL here (asserted below).  If we do not intercept this signal
     * ourselves, drop our record of the app's action entirely.
     */
    if ((info->app_sigaction[sig]->flags & SA_ONESHOT) != 0) {
        ASSERT(info->app_sigaction[sig]->handler == (handler_t) SIG_DFL);
        if (!info->we_intercept[sig]) {
            /* let kernel do default independent of us */
            handler_free(dcontext, info->app_sigaction[sig], sizeof(kernel_sigaction_t));
            info->app_sigaction[sig] = NULL;
        }
    }

    /* the frame's context should describe an app location, not the code cache */
    ASSERT(!safe_is_in_fcache(dcontext, (app_pc) sc->SC_XIP, (byte *)sc->SC_XSP));

#ifdef DEBUG
    if (stats->loglevel >= 3 && (stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "returning-to sigcontext "PFX":\n", sc);
        dump_sigcontext(dcontext, sc);
    }
#endif

    /* XXX i#1206: if we interrupted a non-ignorable syscall to run the app's
     * handler, and we set up to restart the syscall, we'll come here with the
     * translated syscall pc -- thus we can't distinguish from a signal interrupting
     * the prior app instr.  So we can't simply point at do_syscall and call
     * set_at_syscall -- we have to re-interpret the syscall and re-run the
     * pre-syscall handler.  Hopefully all our pre-syscall handlers can handle that.
     */

    /* set up for dispatch */
    /* we have to use a different slot since next_tag ends up holding the do_syscall
     * entry when entered from dispatch (we're called from pre_syscall, prior to entering cache)
     */
    dcontext->asynch_target = (app_pc) sc->SC_XIP;
    next_pc = dcontext->asynch_target;

#ifdef VMX86_SERVER
    /* PR 404712: kernel only restores gp regs so we do it ourselves and avoid
     * complexities of kernel's non-linux-like sigreturn semantics
     */
    sigcontext_to_mcontext(get_mcontext(dcontext), sc);
#else
    /* HACK to get eax put into mcontext AFTER do_syscall */
    dcontext->next_tag = (app_pc) sc->IF_X86_ELSE(SC_XAX, SC_R0);
    /* use special linkstub so we know why we came out of the cache */
    sc->IF_X86_ELSE(SC_XAX, SC_R0) = (ptr_uint_t) get_sigreturn_linkstub();

    /* set our sigreturn context to point to fcache_return */
    sc->SC_XIP = (ptr_uint_t) fcache_return_routine(dcontext);

    /* if we overlaid inner frame on nested signal, will end up with this
     * error -- disable in release build since this is often app's fault (stack
     * too small)
     * FIXME: how make this transparent?  what ends up happening is that we
     * get a segfault when we start interpreting dispatch, we want to make it
     * look like whatever would happen to the app...
     */
    ASSERT((app_pc)sc->SC_XIP != next_pc);
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "set next tag to "PFX", sc->SC_XIP to "PFX"\n",
        next_pc, sc->SC_XIP);

    /* NOTE(review): presumably evaluates to false under VMX86_SERVER (context
     * was restored manually above, so the syscall is skipped) and true
     * otherwise -- confirm against the IF_VMX86_ELSE definition.
     */
    return IF_VMX86_ELSE(false, true);
}

/* Returns true if the bytes at pc match one of the sigreturn code sequences
 * described below: the rt form, or (on non-X64 builds only) the non-rt form.
 * On a match, the length of the matched sequence in bytes (7 for rt, 8 for
 * non-rt) is stored in *len when len is non-NULL.  Returns false otherwise,
 * leaving *len untouched.
 *
 * The check reads pc as uint-sized words at pc and pc+4 (the second word is
 * only read when the first one matches).  Presumably the caller guarantees
 * that memory is readable -- confirm against callers.
 */
bool
is_signal_restorer_code(byte *pc, size_t *len)
{
    /* is this a sigreturn pattern placed by kernel on the stack or vsyscall page?
     * for non-rt frame:
     *    0x58           popl %eax
     *    0xb8 <sysnum>  movl SYS_sigreturn, %eax
     *    0xcd 0x80      int 0x80
     * for rt frame:
     *    0xb8 <sysnum>  movl SYS_rt_sigreturn, %eax
     *    0xcd 0x80      int 0x80
     */
    /* optimized we only need two uint reads, but we have to do
     * some little-endian byte-order reverses to get the right result
     */
    /* reverse(x) swaps the four bytes of a 32-bit value.  Note that neither
     * this macro nor SYS_RT_SIGRET is #undef'd within this function.
     */
#   define reverse(x) ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8) | \
                       (((x) & 0xff0000) >> 8) | (((x) & 0xff000000) >> 24))
#ifdef MACOS
# define SYS_RT_SIGRET SYS_sigreturn
#else
# define SYS_RT_SIGRET SYS_rt_sigreturn
#endif
#ifndef X64
    /* 58 b8 s4 s3 s2 s1 cd 80 */
    /* first word holds 58 b8 plus the two low sysnum bytes; second word holds
     * the two high sysnum bytes followed by cd 80
     */
    static const uint non_rt_1w =  reverse(0x58b80000 | (reverse(SYS_sigreturn) >> 16));
    static const uint non_rt_2w = reverse((reverse(SYS_sigreturn) << 16) | 0xcd80);
#endif
    /* b8 s4 s3 s2 s1 cd 80 XX */
    /* first word holds b8 plus the three low sysnum bytes; second word holds
     * the high sysnum byte, cd 80, and one don't-care byte
     */
    static const uint rt_1w = reverse(0xb8000000 | (reverse(SYS_RT_SIGRET) >> 8));
    static const uint rt_2w = reverse((reverse(SYS_RT_SIGRET) << 24) | 0x00cd8000);
    /* test rt first as it's the most common
     * only 7 bytes here so we ignore the last one (becomes msb since little-endian)
     */
    if (*((uint *)pc) == rt_1w && (*((uint *)(pc+4)) & 0x00ffffff) == rt_2w) {
        if (len != NULL)
            *len = 7;
        return true;
    }
#ifndef X64
    if (*((uint *)pc) == non_rt_1w && *((uint *)(pc+4)) == non_rt_2w) {
        if (len != NULL)
            *len = 8;
        return true;
    }
#endif
    return false;
}


/* Forges a signal for the app, as if it had been raised at target_pc, and
 * transfers control to dispatch for delivery.  Does not return.
 *
 * type selects the signal: ILLEGAL_INSTRUCTION_EXCEPTION maps to SIGILL and
 * UNREADABLE_MEMORY_EXECUTION_EXCEPTION to SIGSEGV; any other value asserts
 * and falls back to SIGSEGV.
 *
 * A zeroed rt signal frame is built in a local buffer, its sigcontext is
 * filled from the current thread's mcontext with the pc replaced by
 * target_pc, and the result is handed to record_pending_signal() marked as
 * forged.
 */
void
os_forge_exception(app_pc target_pc, dr_exception_type_t type)
{
    /* PR 205136:
     * We want to deliver now, and the caller expects us not to return.
     * We have two alternatives:
     * 1) Emulate stack frame, and call transfer_to_dispatch() for delivery.  We
     *    may not know how to fill out every field of the frame (cr2, etc.).  Plus,
     *    we have problems w/ default actions (PR 205310) but we have to solve
     *    those long-term anyway.  We also have to create different frames based on
     *    whether app intercepts via rt or not.
     * 2) Call SYS_tgkill from a special location that our handler can
     *    recognize and know it's a signal meant for the app and that the
     *    interrupted DR can be discarded.  We'd then essentially repeat 1,
     *    but modifying the kernel-generated frame.  We'd have to always
     *    intercept SIGILL.
     * I'm going with #1 for now b/c the common case is simpler.
     */
    dcontext_t *dcontext = get_thread_private_dcontext();
#ifdef LINUX
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
#endif
    /* local storage for the forged frame, plus (on X86) extra room past it
     * that the fpstate pointer below is aimed at
     */
    char frame_plus_xstate[sizeof(sigframe_rt_t)IF_X86( + AVX_FRAME_EXTRA)];
    sigframe_rt_t *frame = (sigframe_rt_t *) frame_plus_xstate;
    int sig;
    /* remember where we came from: whereami is overwritten below */
    where_am_i_t cur_whereami = dcontext->whereami;
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    switch (type) {
    case ILLEGAL_INSTRUCTION_EXCEPTION: sig = SIGILL; break;
    case UNREADABLE_MEMORY_EXECUTION_EXCEPTION: sig = SIGSEGV; break;
    case IN_PAGE_ERROR_EXCEPTION: /* fall-through: Windows only */
    default: ASSERT_NOT_REACHED(); sig = SIGSEGV; break;
    }

    LOG(GLOBAL, LOG_ASYNCH, 1, "os_forge_exception sig=%d\n", sig);

    /* since we always delay delivery, we always want an rt frame.  we'll convert
     * to a plain frame on delivery.
     */
    memset(frame, 0, sizeof(*frame));
    frame->info.si_signo = sig;
#ifndef X64
    frame->sig = sig;
    frame->pinfo = &frame->info;
    frame->puc = (void *) &frame->uc;
#endif
#if defined(LINUX) && defined(X86)
    /* point fpstate at the aligned space that follows the frame in our buffer */
    sc->fpstate = (struct _fpstate *)
        ALIGN_FORWARD(frame_plus_xstate + sizeof(*frame), XSTATE_ALIGNMENT);
#endif /* LINUX && X86 */
    mcontext_to_sigcontext(sc, get_mcontext(dcontext));
    sc->SC_XIP = (reg_t) target_pc;
    /* we'll fill in fpstate at delivery time
     * FIXME: it seems to work w/o filling in the other state:
     * I'm leaving segments, cr2, etc. all zero.
     * Note that x64 kernel restore_sigcontext() only restores cs: it
     * claims onus is on app's signal handler for other segments.
     * We should try to share part of the GET_OWN_CONTEXT macro used for
     * Windows.  Or we can switch to approach #2.
     */
#ifdef LINUX
    /* use the app's restorer when it has one for this signal, else our own
     * sigreturn routine
     */
    if (sig_has_restorer(info, sig))
        frame->pretcode = (char *) info->app_sigaction[sig]->restorer;
    else
        frame->pretcode = (char *) dynamorio_sigreturn;
#endif

    /* We assume that we do not need to translate the context when forged.
     * If we did, we'd move this below enter_nolinking() (and update
     * record_pending_signal() to do the translation).
     */
    record_pending_signal(dcontext, sig, &frame->uc, frame, true/*forged*/
                          _IF_CLIENT(NULL));

    /* For most callers this is not necessary and we only do it to match
     * the Windows usage model: but for forging from our own handler,
     * this is good b/c it resets us to the base of dstack.
     */
    /* tell dispatch() why we're coming there */
    dcontext->whereami = WHERE_TRAMPOLINE;
    KSTART(dispatch_num_exits);
    /* we overload the meaning of the sigreturn linkstub */
    set_last_exit(dcontext, (linkstub_t *) get_sigreturn_linkstub());
    if (is_couldbelinking(dcontext))
        enter_nolinking(dcontext, NULL, false);
    /* full_DR_state is requested unless we were called from the code cache
     * or from the signal handler
     */
    transfer_to_dispatch(dcontext, get_mcontext(dcontext),
                         cur_whereami != WHERE_FCACHE &&
                         cur_whereami != WHERE_SIGNAL_HANDLER
                         /*full_DR_state*/);
    ASSERT_NOT_REACHED();
}

/* Deliberately kills the process with SIGSEGV so that a core file is
 * produced.  Control is not expected to come back (asserted).  msg is not
 * used here.
 *
 * Whether a core actually gets written depends on the process rlimits
 * (e.g., "ulimit -c unlimited").
 */
void
os_request_fatal_coredump(const char *msg)
{
    const int fatal_sig = SIGSEGV;

    SYSLOG_INTERNAL_ERROR("Crashing the process deliberately for a core dump!");
    os_terminate_via_signal(NULL, 0/*no cleanup*/, fatal_sig);
    ASSERT_NOT_REACHED();
}

/* Requests a core dump without terminating the process.
 *
 * Under VMX86_SERVER, when running in a vmkernel userworld, the request is
 * forwarded to vmk_request_live_coredump().  In every other case this only
 * logs that live dumps are unsupported and returns.
 */
void
os_request_live_coredump(const char *msg)
{
#ifdef VMX86_SERVER
    if (os_in_vmkernel_userworld()) {
        vmk_request_live_coredump(msg);
        return;
    }
#endif
    LOG(GLOBAL, LOG_ASYNCH, 1, "LiveCoreDump unsupported (PR 365105).  "
        "Continuing execution without a core.\n");
}

/* Entry point for producing a core dump, driven by the dumpcore_mask and
 * live_dump options:
 *  - DUMPCORE_WAIT_FOR_DEBUGGER: report the pid and spin forever so a
 *    debugger can be attached.
 *  - live_dump: request a live core.
 *  - DUMPCORE_INCLUDE_STACKDUMP: attempt a stack dump, at most once per
 *    method (see below).
 *  - if live_dump is off: request a fatal core, which should not return.
 */
void
os_dump_core(const char *msg)
{
    /* FIXME Case 3408: fork stack dump crashes on 2.6 kernel, so the wait for
     * a debugger is done ahead of it to aid in debugging.
     */
    if (TEST(DUMPCORE_WAIT_FOR_DEBUGGER, dynamo_options.dumpcore_mask)) {
        SYSLOG_INTERNAL_ERROR("looping so you can use gdb to attach to pid %s",
                              get_application_pid());
        IF_CLIENT_INTERFACE(SYSLOG(SYSLOG_CRITICAL, WAITING_FOR_DEBUGGER, 2,
                                   get_application_name(), get_application_pid()));
        /* getchar() can hit our own vsyscall hook (from PR 212570); since the
         * usual goal is to attach and not continue, just yield forever.
         */
        for (;;) {
            os_thread_yield();
        }
    }

    if (DYNAMO_OPTION(live_dump))
        os_request_live_coredump(msg);

    if (TEST(DUMPCORE_INCLUDE_STACKDUMP, dynamo_options.dumpcore_mask)) {
        /* The fork + core + gdb stack dump can itself fault and re-enter
         * here, so each method is guarded by a do-once flag: the first
         * entry tries the full stack dump, the second falls back to the
         * no-symbols DR callstack, and later entries do neither.
         */
        static bool tried_stackdump = false;
        static bool tried_calldump = false;
        if (!tried_stackdump) {
            tried_stackdump = true;
            stackdump();
        } else if (!tried_calldump) {
            tried_calldump = true;
            dump_dr_callstack(STDERR);
        }
    }

    if (DYNAMO_OPTION(live_dump))
        return;
    os_request_fatal_coredump(msg);
    ASSERT_NOT_REACHED();
}

#ifdef RETURN_AFTER_CALL
/* Return-after-call check: decides whether a transfer to target_pc coming
 * from source_fragment is one of the known, benign exceptions.
 *
 * Two cases are recognized: a return into the signal restorer, and the
 * Linux dynamic symbol resolution routine (_dl_runtime_resolve).  The latter
 * is assumed to be the only other recurring exception, so while no
 * fragment has been recorded for it yet we pattern match to confirm that it
 * really is _dl_runtime_resolve (with LD_BIND_NOW it is never called);
 * otherwise we compare against the recorded value.
 */
bool
at_known_exception(dcontext_t *dcontext, app_pc target_pc, app_pc source_fragment)
{
    static app_pc known_exception = 0;
    thread_sig_info_t *sig_info = (thread_sig_info_t *) dcontext->signal_field;
    int ret_imm;

    LOG(THREAD, LOG_INTERP, 1, "RCT: testing for KNOWN exception "PFX" "PFX"\n",
        target_pc, source_fragment);

    /* Signal return?
     * FIXME: we should really get that from the frame itself.
     * The restorer is currently only grabbed when copying a frame, so with
     * nested signals this works only if they all share one restorer
     * (no restorer other than the one in libc has been observed).
     */
    if (sig_info->signal_restorer_retaddr == target_pc) {
        LOG(THREAD, LOG_INTERP, 1, "RCT: KNOWN exception this is a signal restorer --ok \n");
        STATS_INC(ret_after_call_signal_restorer);
        return true;
    }

    /* Same source as the recorded exception? */
    if (known_exception == source_fragment) {
        LOG(THREAD, LOG_INTERP, 1, "RCT: KNOWN exception again _dl_runtime_resolve --ok\n");
        return true;
    }

    /* Something is already recorded and it did not match. */
    if (known_exception != 0)
        return false;

    /* Nothing recorded yet: fall back to pattern matching. */
    return at_dl_runtime_resolve_ret(dcontext, source_fragment, &ret_imm);
}
#endif /* RETURN_AFTER_CALL */

/***************************************************************************
 * ITIMERS
 *
 * We support combining an app itimer with a DR itimer for each of the 3 types
 * (PR 204556).
 */

static inline uint64
timeval_to_usec(struct timeval *t1)
{
    return ((uint64)(t1->tv_sec))*1000000 + t1->tv_usec;
}

/* Splits a microsecond count into the seconds and microseconds fields of *t1.
 *
 * The division and remainder are carried out on the full 64-bit value and
 * only the results are narrowed to long.  Casting usec to long first (as
 * "(long) usec / 1000000" does, since the cast binds tighter than the
 * division) truncates the input wherever long is narrower than 64 bits,
 * yielding wrong results for values above roughly 2^31 microseconds.
 */
static inline void
usec_to_timeval(uint64 usec, struct timeval *t1)
{
    t1->tv_sec = (long) (usec / 1000000);
    t1->tv_usec = (long) (usec % 1000000);
}

/* Allocates and zeroes this thread's itimer bookkeeping array
 * (info->itimer).  The array comes from the global heap when
 * os_itimers_thread_shared() reports that itimers are shared, and from the
 * thread's own heap otherwise.
 *
 * When first is true, the current kernel settings of every itimer type are
 * additionally read and recorded as the app's interval and value.
 */
static void
init_itimer(dcontext_t *dcontext, bool first)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    struct itimerval existing;
    int which;
    int rc;

    ASSERT(info != NULL);
    ASSERT(!info->shared_itimer); /* else inherit */
    LOG(THREAD, LOG_ASYNCH, 2, "thread has private itimers%s\n",
        os_itimers_thread_shared() ? " (for now)" : "");
    if (!os_itimers_thread_shared()) {
        /* not strictly needed yet, but we allocate up front to keep this
         * path parallel with the shared one
         */
        info->itimer = (thread_itimer_info_t (*)[NUM_ITIMERS])
            heap_alloc(dcontext, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
    } else {
        /* must exist before any itimer is installed so that every child
         * thread ends up pointing at the same data
         */
        info->itimer = (thread_itimer_info_t (*)[NUM_ITIMERS])
            global_heap_alloc(sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
    }
    memset(info->itimer, 0, sizeof(*info->itimer));

    if (!first)
        return;

    /* pick up any itimer the app set up before we were loaded */
    for (which = 0; which < NUM_ITIMERS; which++) {
        thread_itimer_info_t *slot = &(*info->itimer)[which];
        rc = getitimer_syscall(which, &existing);
        ASSERT(rc == SUCCESS);
        slot->app.interval = timeval_to_usec(&existing.it_interval);
        slot->app.value = timeval_to_usec(&existing.it_value);
    }
}

/* Programs the kernel itimer of type which for this thread.
 *
 * With enable set, the timer is installed using the recorded "actual"
 * interval and value; otherwise an all-zero itimerval is installed, which
 * disables the timer.  Returns true iff the setitimer syscall succeeded.
 *
 * For shared itimers it is up to the caller to hold the lock.
 */
static bool
set_actual_itimer(dcontext_t *dcontext, int which, thread_sig_info_t *info,
                  bool enable)
{
    struct itimerval new_val;

    ASSERT(info != NULL && info->itimer != NULL);
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    if (!enable) {
        LOG(THREAD, LOG_ASYNCH, 2, "disabling itimer %d\n", which);
        memset(&new_val, 0, sizeof(new_val));
    } else {
        thread_itimer_info_t *timer = &(*info->itimer)[which];
        ASSERT(!info->shared_itimer || self_owns_recursive_lock(info->shared_itimer_lock));
        usec_to_timeval(timer->actual.interval, &new_val.it_interval);
        usec_to_timeval(timer->actual.value, &new_val.it_value);
        LOG(THREAD, LOG_ASYNCH, 2, "installing itimer %d interval="INT64_FORMAT_STRING
            ", value="INT64_FORMAT_STRING"\n", which,
            timer->actual.interval, timer->actual.value);
    }
    return setitimer_syscall(which, &new_val, NULL) == SUCCESS;
}

/* Recomputes the "actual" (kernel-programmed) settings for itimer which
 * after either the app's or DR's requested settings have changed.
 * app_changed is true when the app's request changed, false when DR's did.
 * Returns false only if (re)programming the kernel timer failed.
 *
 * The general strategy: program the kernel with the smaller of the two
 * requests, update the larger one on each signal, and when the larger
 * becomes the smaller do a one-time swap for the remainder.
 *
 * Caller should hold lock.
 */
bool
itimer_new_settings(dcontext_t *dcontext, int which, bool app_changed)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    thread_itimer_info_t *timer;
    struct itimerval cur;
    uint64 remaining;
    int rc;

    ASSERT(info != NULL && info->itimer != NULL);
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    ASSERT(!info->shared_itimer || self_owns_recursive_lock(info->shared_itimer_lock));
    timer = &(*info->itimer)[which];

    /* actual interval: DR's if it is set and is the smaller (or only) one */
    if (timer->dr.interval > 0 &&
        (timer->app.interval == 0 || timer->dr.interval < timer->app.interval)) {
        timer->actual.interval = timer->dr.interval;
    } else {
        timer->actual.interval = timer->app.interval;
    }

    if (!(timer->actual.value > 0)) {
        /* nothing in flight: choose the initial value the same way as the
         * interval, then install the timer
         */
        if (timer->dr.value > 0 &&
            (timer->app.value == 0 || timer->dr.value < timer->app.value)) {
            timer->actual.value = timer->dr.value;
        } else {
            timer->actual.value = timer->app.value;
        }
        return set_actual_itimer(dcontext, which, info, true/*enable*/);
    }

    if (timer->actual.interval == 0 && timer->dr.value == 0 &&
        timer->app.value == 0) {
        /* neither side wants the timer any longer: turn it off */
        timer->actual.value = 0;
        return set_actual_itimer(dcontext, which, info, false/*disabled*/);
    }

    /* one of app or us has an in-flight timer which we should not interrupt.
     * but, the new requested value (for app or us) is already recorded, so
     * the actual value has to be refreshed for later subtraction to be right.
     */
    rc = getitimer_syscall(which, &cur);
    ASSERT(rc == SUCCESS);
    remaining = timeval_to_usec(&cur.it_value);
    if (app_changed) {
        if (timer->actual.value == timer->dr.value)
            timer->dr.value = remaining;
    } else {
        if (timer->actual.value == timer->app.value)
            timer->app.value = remaining;
    }
    timer->actual.value = remaining;
    return true;
}

/* Registers DR's callback(s) for itimer which, to fire every millisec
 * milliseconds, and recomputes the actual timer settings.
 *
 * Returns false (after a client assert) when which is out of range or when a
 * nonzero period is requested without any callback; otherwise returns the
 * result of itimer_new_settings().  For shared itimers the itimer lock is
 * held around the update.
 */
bool
set_itimer_callback(dcontext_t *dcontext, int which, uint millisec,
                    void (*func)(dcontext_t *, priv_mcontext_t *),
                    void (*func_api)(dcontext_t *, dr_mcontext_t *))
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    thread_itimer_info_t *timer;
    bool ok;

    if (which < 0 || which >= NUM_ITIMERS) {
        CLIENT_ASSERT(false, "invalid itimer type");
        return false;
    }
    if (millisec != 0 && func == NULL && func_api == NULL) {
        CLIENT_ASSERT(false, "invalid function");
        return false;
    }
    ASSERT(info != NULL && info->itimer != NULL);

    if (info->shared_itimer)
        acquire_recursive_lock(info->shared_itimer_lock);

    timer = &(*info->itimer)[which];
    /* DR's period is kept in microseconds; the first expiry uses the same span */
    timer->dr.interval = ((uint64)millisec)*1000;
    timer->dr.value = timer->dr.interval;
    timer->cb = func;
    timer->cb_api = func_api;
    ok = itimer_new_settings(dcontext, which, false/*us*/);

    if (info->shared_itimer)
        release_recursive_lock(info->shared_itimer_lock);
    return ok;
}

/* Returns DR's configured period for itimer which, in milliseconds (the
 * stored microsecond interval divided by 1000).  Returns 0, after a client
 * assert, when which is out of range.  For shared itimers the itimer lock is
 * held while the value is read.
 */
uint
get_itimer_frequency(dcontext_t *dcontext, int which)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    uint period_ms = 0;

    if (which < 0 || which >= NUM_ITIMERS) {
        CLIENT_ASSERT(false, "invalid itimer type");
        return 0;
    }
    ASSERT(info != NULL && info->itimer != NULL);

    if (info->shared_itimer)
        acquire_recursive_lock(info->shared_itimer_lock);

    period_ms = (*info->itimer)[which].dr.interval / 1000;

    if (info->shared_itimer)
        release_recursive_lock(info->shared_itimer_lock);

    return period_ms;
}

/* Handles delivery of one of the alarm signals (SIGALRM, SIGVTALRM, SIGPROF).
 *
 * The elapsed period (actual.value) is charged against both the app's and
 * DR's remaining time for the matching itimer.  Whichever of the two reaches
 * zero is reloaded from its interval; if one is still partway through, the
 * timer is re-armed manually via itimer_new_settings().  When DR's countdown
 * expires, the registered callback is invoked with an mcontext built from the
 * interrupted signal context.
 *
 * Returns true if the app's countdown expired and the signal should be
 * passed on to the app, false otherwise (including after DR has exited).
 */
static bool
handle_alarm(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    int which = 0;
    bool run_dr_cb = false;
    bool deliver_to_app = false;
    bool rearm_manually = false;
    bool took_lock = false;

    /* i#471: suppress alarms coming in after exit */
    if (dynamo_exited)
        return false;

    switch (sig) {
    case SIGALRM:
        which = ITIMER_REAL;
        break;
    case SIGVTALRM:
        which = ITIMER_VIRTUAL;
        break;
    case SIGPROF:
        which = ITIMER_PROF;
        break;
    default:
        ASSERT_NOT_REACHED();
        break;
    }
    LOG(THREAD, LOG_ASYNCH, 2, "received alarm %d @"PFX"\n", which, sc->SC_XIP);

    /* This alarm may have interrupted an app thread that was in the middle of
     * an itimer syscall.  If we already own the lock we do not re-acquire it
     * (the app may have been in the middle of acquiring) and simply hope that
     * conflicting writes work out.  Otherwise we try twice; as a heuristic,
     * two failures are taken to mean we interrupted the lock routine itself,
     * and we again continue without the lock.
     */
    if (info->shared_itimer &&
        !self_owns_recursive_lock(info->shared_itimer_lock)) {
        if (try_recursive_lock(info->shared_itimer_lock) ||
            try_recursive_lock(info->shared_itimer_lock)) {
            took_lock = true;
        }
    }

    /* Charge the elapsed period to the app's countdown.  The >= test guards
     * against an alarm that was already on its way when the app value changed.
     */
    if ((*info->itimer)[which].app.value > 0 &&
        (*info->itimer)[which].app.value >= (*info->itimer)[which].actual.value) {
        (*info->itimer)[which].app.value -= (*info->itimer)[which].actual.value;
        LOG(THREAD, LOG_ASYNCH, 2,
            "\tapp value is now %d\n", (*info->itimer)[which].app.value);
        if ((*info->itimer)[which].app.value != 0) {
            rearm_manually = true;
        } else {
            deliver_to_app = true;
            (*info->itimer)[which].app.value = (*info->itimer)[which].app.interval;
        }
    }

    /* Same accounting for DR's countdown; the >= test covers an alarm that was
     * in flight when the DR value changed.
     */
    if ((*info->itimer)[which].dr.value > 0 &&
        (*info->itimer)[which].dr.value >= (*info->itimer)[which].actual.value) {
        (*info->itimer)[which].dr.value -= (*info->itimer)[which].actual.value;
        LOG(THREAD, LOG_ASYNCH, 2,
            "\tdr value is now %d\n", (*info->itimer)[which].dr.value);
        if ((*info->itimer)[which].dr.value != 0) {
            rearm_manually = true;
        } else {
            run_dr_cb = true;
            (*info->itimer)[which].dr.value = (*info->itimer)[which].dr.interval;
        }
    }

    /* For efficiency the kernel is left to reload value from interval when
     * only one timer is in play; otherwise we re-arm explicitly.
     */
    if (!rearm_manually) {
        (*info->itimer)[which].actual.value = (*info->itimer)[which].actual.interval;
    } else {
        (*info->itimer)[which].actual.value = 0;
        itimer_new_settings(dcontext, which, true/*doesn't matter: actual.value==0*/);
    }

    if (run_dr_cb) {
        /* Invoked only after the new itimer value has been set.  A
         * dr_mcontext_t is a superset of priv_mcontext_t, so a single stack
         * allocation serves both callback flavors.
         */
        dr_mcontext_t dmc;
        priv_mcontext_t *mc;
        dr_mcontext_init(&dmc);
        mc = dr_mcontext_as_priv_mcontext(&dmc);
        sigcontext_to_mcontext(mc, sc);
        if ((*info->itimer)[which].cb == NULL)
            (*(*info->itimer)[which].cb_api)(dcontext, &dmc);
        else
            (*(*info->itimer)[which].cb)(dcontext, mc);
    }

    if (info->shared_itimer && took_lock)
        release_recursive_lock(info->shared_itimer_lock);
    return deliver_to_app;
}

/* Brings this thread's itimers under DR control.  For a shared itimer this
 * bumps the shared under-DR refcount and only the first thread to arrive
 * actually starts the timers; for a private itimer they are always started.
 * Calling this more than once for the same thread is *not* safe: the refcount
 * would be inflated and cleanup would never happen.
 */
void
start_itimer(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    bool first_under_dr = true;

    if (info->shared_itimer) {
        acquire_recursive_lock(info->shared_itimer_lock);
        (*info->shared_itimer_underDR)++;
        first_under_dr = (*info->shared_itimer_underDR == 1);
    }

    if (first_under_dr) {
        /* At least one thread in this set of threads sharing itimers is now
         * under DR control, so enable every DR itimer.
         */
        int idx;
        LOG(THREAD, LOG_ASYNCH, 2, "starting DR itimers from thread "TIDFMT"\n",
            get_thread_id());
        for (idx = 0; idx < NUM_ITIMERS; idx++) {
            /* A timer with a nonzero value is already running, which happens
             * if there has been no stop_itimer() since init time.
             */
            if ((*info->itimer)[idx].dr.value == 0 &&
                (*info->itimer)[idx].dr.interval > 0) {
                (*info->itimer)[idx].dr.value = (*info->itimer)[idx].dr.interval;
                itimer_new_settings(dcontext, idx, false/*!app*/);
            }
        }
    }

    if (info->shared_itimer) {
        release_recursive_lock(info->shared_itimer_lock);
    }
}

/* Removes this thread from the set of threads using DR's itimers.  For a
 * shared itimer this drops the shared under-DR refcount and only the last
 * thread to leave stops DR's timers; for a private itimer they are always
 * stopped.  Calling this more than once on the same thread is not safe.
 */
void
stop_itimer(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    bool last_under_dr = true;

    if (info->shared_itimer) {
        acquire_recursive_lock(info->shared_itimer_lock);
        ASSERT(*info->shared_itimer_underDR > 0);
        (*info->shared_itimer_underDR)--;
        last_under_dr = (*info->shared_itimer_underDR == 0);
    }

    if (last_under_dr) {
        /* The whole set of threads sharing this itimer is now native, so
         * disable every DR itimer.
         */
        int idx;
        LOG(THREAD, LOG_ASYNCH, 2, "stopping DR itimers from thread "TIDFMT"\n",
            get_thread_id());
        for (idx = 0; idx < NUM_ITIMERS; idx++) {
            if ((*info->itimer)[idx].dr.value > 0) {
                (*info->itimer)[idx].dr.value = 0;
                if ((*info->itimer)[idx].app.value > 0) {
                    /* The app still has a timer pending: keep the real timer
                     * but let it reload with the app's interval.
                     */
                    (*info->itimer)[idx].actual.interval =
                        (*info->itimer)[idx].app.interval;
                } else {
                    /* Nobody needs the real timer any longer. */
                    set_actual_itimer(dcontext, idx, info, false/*disable*/);
                }
            }
        }
    }

    if (info->shared_itimer) {
        release_recursive_lock(info->shared_itimer_lock);
    }
}

/* Pre-syscall handling of the app's setitimer.
 * handle_pre_alarm also calls this function and passes NULL as prev_timer.
 *
 * Does nothing if new_timer is NULL, "which" is out of range, or new_timer
 * cannot be read safely.  Otherwise the current app settings are stashed in
 * app_saved (so a failed syscall can be undone), the requested interval and
 * value become the app's settings, and itimer_new_settings() is invoked.
 * prev_timer is not used here.
 */
void
handle_pre_setitimer(dcontext_t *dcontext,
                     int which, const struct itimerval *new_timer,
                     struct itimerval *prev_timer)
{
    if (new_timer == NULL || which < 0 || which >= NUM_ITIMERS)
        return;
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    struct itimerval requested;
    if (!safe_read(new_timer, sizeof(requested), &requested))
        return;

    if (info->shared_itimer) {
        acquire_recursive_lock(info->shared_itimer_lock);
    }
    /* keep the old settings around in case the syscall fails */
    (*info->itimer)[which].app_saved = (*info->itimer)[which].app;
    (*info->itimer)[which].app.interval = timeval_to_usec(&requested.it_interval);
    (*info->itimer)[which].app.value = timeval_to_usec(&requested.it_value);
    LOG(THREAD, LOG_ASYNCH, 2,
        "app setitimer type=%d interval="SZFMT" value="SZFMT"\n",
        which, (*info->itimer)[which].app.interval,
        (*info->itimer)[which].app.value);
    itimer_new_settings(dcontext, which, true/*app*/);
    if (info->shared_itimer) {
        release_recursive_lock(info->shared_itimer_lock);
    }
}

/* Post-syscall handling of the app's setitimer.
 *
 * Nothing is done when new_timer is NULL or "which" is out of range (the
 * pre-syscall handler made no changes in those cases).  If the syscall
 * succeeded and the app asked for the previous settings, prev_timer is fixed
 * up via handle_post_getitimer().  If the syscall failed, the app settings
 * saved by the pre-syscall handler are restored and re-applied.
 */
void
handle_post_setitimer(dcontext_t *dcontext, bool success,
                      int which, const struct itimerval *new_timer,
                      struct itimerval *prev_timer)
{
    if (new_timer == NULL || which < 0 || which >= NUM_ITIMERS) {
        ASSERT(new_timer == NULL || !success);
        return;
    }
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);

    if (success) {
        if (prev_timer != NULL)
            handle_post_getitimer(dcontext, success, which, prev_timer);
        return;
    }

    /* The syscall failed: restore saved pre-syscall settings.  new_timer is
     * known to be non-NULL and "which" in range from the check above.
     */
    if (info->shared_itimer) {
        acquire_recursive_lock(info->shared_itimer_lock);
    }
    (*info->itimer)[which].app = (*info->itimer)[which].app_saved;
    itimer_new_settings(dcontext, which, true/*app*/);
    if (info->shared_itimer) {
        release_recursive_lock(info->shared_itimer_lock);
    }
}

/* Post-syscall handling of the app's getitimer (also used to fix up the
 * prev_timer output of setitimer).
 *
 * The kernel has filled cur_timer with the state of the real timer, which
 * multiplexes the app's and DR's timers.  On success this rewrites cur_timer
 * so that the app sees its own view:
 *   - it_interval becomes the app's interval;
 *   - it_value becomes the app's last recorded value minus the time that has
 *     elapsed on the real timer since it was last armed
 *     (actual.value - kernel's remaining value).
 *
 * The remaining-time computation is done without letting the unsigned
 * arithmetic wrap: if the app has no timer armed (app.value == 0), or more
 * time has elapsed than the app had left, 0 is reported rather than a huge
 * bogus value.
 *
 * Nothing is done when success is false.
 */
void
handle_post_getitimer(dcontext_t *dcontext, bool success,
                      int which, struct itimerval *cur_timer)
{
    thread_sig_info_t *info = (thread_sig_info_t *) dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    if (success) {
        /* write succeeded for kernel but we're user and can have races */
        struct timeval val;
        DEBUG_DECLARE(bool ok;)
        ASSERT(which >= 0 && which < NUM_ITIMERS);
        ASSERT(cur_timer != NULL);
        if (info->shared_itimer)
            acquire_recursive_lock(info->shared_itimer_lock);
        usec_to_timeval((*info->itimer)[which].app.interval, &val);
        IF_DEBUG(ok = )
            safe_write_ex(&cur_timer->it_interval, sizeof(val), &val, NULL);
        ASSERT(ok);
        if (safe_read(&cur_timer->it_value, sizeof(val), &val)) {
            /* Subtract the difference between the last-asked-for value and
             * the kernel's current value to reflect elapsed time:
             *   left = app.value - (actual.value - kernel_value)
             * It is evaluated as (app.value + kernel_value) - actual.value so
             * a negative result can be detected and clamped to 0 instead of
             * wrapping.  A disarmed app timer always reports 0, since a zero
             * it_value is how a disarmed timer is reported.
             */
            uint64 left = 0;
            if ((*info->itimer)[which].app.value > 0) {
                uint64 credit =
                    (*info->itimer)[which].app.value + timeval_to_usec(&val);
                uint64 armed = (*info->itimer)[which].actual.value;
                if (credit > armed)
                    left = credit - armed;
            }
            usec_to_timeval(left, &val);
            IF_DEBUG(ok = )
                safe_write_ex(&cur_timer->it_value, sizeof(val), &val, NULL);
            ASSERT(ok);
        } else
            ASSERT_NOT_REACHED();
        if (info->shared_itimer)
            release_recursive_lock(info->shared_itimer_lock);
    }
}

/* Pre-syscall handling of the app's alarm syscall.
 * alarm uses the same itimer as ITIMER_REAL and could be defined in terms of
 * setitimer, so it is modeled here as a setitimer request with a zero
 * interval and a value of "sec" whole seconds.
 */
void
handle_pre_alarm(dcontext_t *dcontext, unsigned int sec)
{
    struct itimerval one_shot = {
        .it_interval = { .tv_sec = 0, .tv_usec = 0 },
        .it_value = { .tv_sec = sec, .tv_usec = 0 },
    };
    handle_pre_setitimer(dcontext, ITIMER_REAL, &one_shot, NULL);
}

/* Post-syscall handling of the app's alarm syscall.  alarm is always
 * successful, so there is nothing to undo or fix up here; we only check
 * that assumption.
 */
void
handle_post_alarm(dcontext_t *dcontext, bool success, unsigned int sec)
{
    ASSERT(success);
}

/***************************************************************************/

/* Handles receipt of the suspend signal on this thread.
 *
 * Three outcomes are possible:
 *  - ostd->terminate is set: the thread marks itself terminated and exits
 *    (X86 only; not yet implemented on ARM).
 *  - No suspension is pending (suspend_count == 0) or this thread is already
 *    suspended (suspended_sigcxt != NULL): the signal is treated as the app's.
 *  - Otherwise the thread records its signal context, announces that it is
 *    suspended, and waits here until ostd->wakeup is set.
 *
 * Returns whether to pass on to app.
 */
static bool
handle_suspend_signal(dcontext_t *dcontext, kernel_ucontext_t *ucxt)
{
    os_thread_data_t *ostd = (os_thread_data_t *) dcontext->os_field;
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    kernel_sigset_t prevmask;
    ASSERT(ostd != NULL);

    /* A termination request is checked before any suspension logic */
    if (ostd->terminate) {
#ifdef X86
         /* PR 297902: exit this thread, without using any stack */
# ifdef MACOS
        /* We need a stack as 32-bit syscalls take args on the stack.
         * We go ahead and use it for x64 too for simpler sysenter return.
         * We don't have a lot of options: we're terminating, so we go ahead
         * and use the app stack.
         */
        byte *app_xsp = (byte *) get_mcontext(dcontext)->xsp;
# endif
        LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: exiting\n");
        if (ksynch_kernel_support()) {
            /* can't use stack once set terminated to 1 so in asm we do:
             *   ostd->terminated = 1;
             *   futex_wake_all(&ostd->terminated in xax);
             *   semaphore_signal_all(&ostd->terminated in xax);
             */
# ifdef MACOS
            KSYNCH_TYPE *term = &ostd->terminated;
            ASSERT(sizeof(ostd->terminated.sem) == 4);
# else
            volatile int *term = &ostd->terminated;
# endif
            /* load the address of ostd->terminated into xax */
            asm("mov %0, %%"ASM_XAX : : "m"(term));
# ifdef MACOS
            /* NOTE(review): the store goes to offset 4, presumably the value
             * field that follows the 4-byte sem asserted above -- confirm
             * against the KSYNCH_TYPE layout.
             */
            asm("movl $1,4(%"ASM_XAX")");
            /* switch to the app stack before jumping to the exit routine */
            asm("mov %0, %%"ASM_XSP : : "m"(app_xsp));
            asm("jmp _dynamorio_semaphore_signal_all");
# else
            asm("movl $1,(%"ASM_XAX")");
            asm("jmp dynamorio_futex_wake_and_exit");
# endif
        } else {
            /* No kernel synch support: set the flag from C and jump to exit */
            ksynch_set_value(&ostd->terminated, 1);
# ifdef MACOS
            asm("mov %0, %%"ASM_XSP : : "m"(app_xsp));
            asm("jmp _dynamorio_sys_exit");
# else
            asm("jmp dynamorio_sys_exit");
# endif
        }
        ASSERT_NOT_REACHED();
#elif defined(ARM)
        /* FIXME i#1551: NYI on ARM */
        ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
        /* Reached only if the code above did not exit the thread */
        return false;
    }

    /* If suspend_count is 0, we are not trying to suspend this thread
     * (os_thread_resume() may have already decremented suspend_count to 0, but
     * os_thread_suspend() will not send a signal until this thread unsets
     * ostd->suspended, so not having a lock around the suspend_count read is
     * ok), so pass signal to app.
     * If we are trying or have already suspended this thread, our own
     * os_thread_suspend() will not send a 2nd suspend signal until we are
     * completely resumed, so we can distinguish app uses of SUSPEND_SIGNAL.  We
     * can't have a race between the read and write of suspended_sigcxt b/c
     * signals are blocked.  It's fine to have a race and reorder the app's
     * signal w/ DR's.
     */
    if (ostd->suspend_count == 0 || ostd->suspended_sigcxt != NULL)
        return true; /* pass to app */

    /* Record the interrupted context; a non-NULL value also marks this thread
     * as being inside the suspend handler (see the check above).
     */
    ostd->suspended_sigcxt = sc;

    /* We're sitting on our sigaltstack w/ all signals blocked.  We're
     * going to stay here but unblock all signals so we don't lose any
     * delivered while we're waiting.  We're at a safe enough point to
     * re-enter master_signal_handler().  We use a mutex in
     * thread_{suspend,resume} to prevent our own re-suspension signal
     * from arriving before we've re-blocked on the resume.
     */
    /* The mask taken from ucxt is installed and the handler's current mask is
     * saved in prevmask so it can be reinstated after the wait below.
     */
    sigprocmask_syscall(SIG_SETMASK, SIGMASK_FROM_UCXT(ucxt), &prevmask,
                        sizeof(ucxt->uc_sigmask));

    LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: suspended now\n");
    /* We cannot use mutexes here as we have interrupted DR at an
     * arbitrary point!  Thus we can't use the event_t routines.
     * However, the existing synch and check above prevent any
     * re-entrance here, and our cond vars target just a single thread,
     * so we can get away w/o a mutex.
     */
    /* Notify os_thread_suspend that it can now return, as this thread is
     * officially suspended now and is ready for thread_{get,set}_mcontext.
     */
    ASSERT(ksynch_get_value(&ostd->suspended) == 0);
    ksynch_set_value(&ostd->suspended, 1);
    ksynch_wake_all(&ostd->suspended);
    /* i#96/PR 295561: use futex(2) if available */
    while (ksynch_get_value(&ostd->wakeup) == 0) {
        /* Waits only if the wakeup flag is not set as 1. Return value
         * doesn't matter because the flag will be re-checked.
         */
        ksynch_wait(&ostd->wakeup, 0);
        if (ksynch_get_value(&ostd->wakeup) == 0) {
            /* If it still has to wait, give up the cpu. */
            os_thread_yield();
        }
    }
    LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: awake now\n");

    /* re-block so our exit from master_signal_handler is not interrupted */
    sigprocmask_syscall(SIG_SETMASK, &prevmask, NULL, sizeof(prevmask));
    /* Clearing this allows a later suspend signal to be handled again */
    ostd->suspended_sigcxt = NULL;

    /* Notify os_thread_resume that it can return now, which (assuming
     * suspend_count is back to 0) means it's then safe to re-suspend.
     */
    ksynch_set_value(&ostd->suspended, 0); /* reset prior to signalling os_thread_resume */
    ksynch_set_value(&ostd->resumed, 1);
    ksynch_wake_all(&ostd->resumed);

    /* A pending retakeover request is consumed here: the flag is cleared and
     * control is handed to sig_take_over() with the interrupted context.
     */
    if (ostd->retakeover) {
        ostd->retakeover = false;
        sig_take_over(sc);  /* no return */
        ASSERT_NOT_REACHED();
    }

    return false; /* do not pass to app */
}

/* PR 206278: for try/except we need to save the signal mask.
 *
 * In DEBUG builds this records the current signal mask into buf->sigmask:
 * the call passes NULL as the new set, so only the old-mask output is
 * filled in.  In non-DEBUG builds this function does nothing.
 */
void
dr_setjmp_sigmask(dr_jmp_buf_t *buf)
{
    /* i#226/PR 492568: we rely on the kernel storing the prior mask in the
     * signal frame, so we do not need to store it on every setjmp, which
     * can be a performance hit.
     */
#ifdef DEBUG
    /* NOTE(review): presumably kept in DEBUG so the saved mask can be
     * cross-checked elsewhere -- confirm against the users of buf->sigmask.
     */
    sigprocmask_syscall(SIG_SETMASK, NULL, &buf->sigmask, sizeof(buf->sigmask));
#endif
}

/* i#61/PR 211530: nudge on Linux.
 * Determines whether this is a nudge signal, and if so queues up a nudge,
 * or is an app signal.  Returns whether to pass the signal on to the app.
 *
 * The siginfo is reinterpreted in place as a nudge_arg_t.  A signal is
 * treated as the app's if its signo/si_code/si_errno do not match the nudge
 * pattern, or if the interrupted instruction fails to decode or is an
 * undefined instruction.  A nudge that arrives before DR is initialized,
 * after DR has exited, or with no dcontext is dropped (and not passed to
 * the app).  Otherwise the nudge is added to this thread's pending list.
 */
static bool
handle_nudge_signal(dcontext_t *dcontext, siginfo_t *siginfo, kernel_ucontext_t *ucxt)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    /* The nudge arguments are overlaid on the siginfo_t itself */
    nudge_arg_t *arg = (nudge_arg_t *) siginfo;
    instr_t instr;
    char buf[MAX_INSTR_LENGTH];

    /* Distinguish a nudge from an app signal.  An app using libc sigqueue()
     * will never have its signal mistaken as libc does not expose the siginfo_t
     * and always passes 0 for si_errno, so we're only worried beyond our
     * si_code check about an app using a raw syscall that is deliberately
     * trying to fool us.
     * While there is a lot of padding space in siginfo_t, the kernel doesn't
     * copy it through on SYS_rt_sigqueueinfo so we don't have room for any
     * dedicated magic numbers.  The client id could function as a magic
     * number for client nudges, but I don't think we want to kill the app
     * if an external nudger types the client id wrong.
     */
    if (siginfo->si_signo != NUDGESIG_SIGNUM
        /* PR 477454: remove the IF_NOT_VMX86 once we have nudge-arg support */
        IF_NOT_VMX86(|| siginfo->si_code != SI_QUEUE
                     || siginfo->si_errno == 0)) {
        return true; /* pass to app */
    }
#if defined(CLIENT_INTERFACE) && !defined(VMX86_SERVER)
    /* Debug-only diagnostic: warn about a client nudge aimed at an unknown
     * client id.  The nudge is still processed below.
     */
    DODEBUG({
        if (TEST(NUDGE_GENERIC(client), arg->nudge_action_mask) &&
            !is_valid_client_id(arg->client_id)) {
            SYSLOG_INTERNAL_WARNING("received client nudge for invalid id=0x%x",
                                    arg->client_id);
        }
    });
#endif
    if (dynamo_exited || !dynamo_initialized || dcontext == NULL) {
        /* Ignore the nudge: too early, or too late.
         * Xref Windows handling of such cases in nudge.c: old case 5702, etc.
         * We do this before the illegal-instr check b/c it's unsafe to decode
         * if too early or too late.
         */
        SYSLOG_INTERNAL_WARNING("too-early or too-late nudge: ignoring");
        return false; /* do not pass to app */
    }

    /* As a further check, try to detect whether this was raised synchronously
     * from a real illegal instr: though si_code for that should not be
     * SI_QUEUE.  It's possible a nudge happened to come at a bad instr before
     * it faulted, or maybe the instr after a syscall or other wait spot is
     * illegal, but we'll live with that risk.
     */
    ASSERT(NUDGESIG_SIGNUM == SIGILL); /* else this check makes no sense */
    instr_init(dcontext, &instr);
    /* The bytes at the interrupted pc are copied into a local buffer and
     * decoded from there.  If the read itself fails, the check is skipped and
     * the signal continues to be treated as a nudge.
     */
    if (safe_read((byte *)sc->SC_XIP, sizeof(buf), buf) &&
        (decode(dcontext, (byte *)buf, &instr) == NULL ||
         /* check for ud2 (xref PR 523161) */
         instr_is_undefined(&instr))) {
        instr_free(dcontext, &instr);
        return true; /* pass to app */
    }
    instr_free(dcontext, &instr);

#ifdef VMX86_SERVER
    /* Treat as a client nudge until we have PR 477454 */
    /* With no argument payload (si_errno == 0), default client-nudge
     * arguments are written over the siginfo storage that arg aliases.
     */
    if (siginfo->si_errno == 0) {
        arg->version = NUDGE_ARG_CURRENT_VERSION;
        arg->flags = 0;
        arg->nudge_action_mask = NUDGE_GENERIC(client);
        arg->client_id = 0;
        arg->client_arg = 0;
    }
#endif

    LOG(THREAD, LOG_ASYNCH, 1,
        "received nudge version=%u flags=0x%x mask=0x%x id=0x%08x arg=0x"
        ZHEX64_FORMAT_STRING"\n",
        arg->version, arg->flags, arg->nudge_action_mask,
        arg->client_id, arg->client_arg);
    SYSLOG_INTERNAL_INFO("received nudge mask=0x%x id=0x%08x arg=0x"ZHEX64_FORMAT_STRING,
                         arg->nudge_action_mask, arg->client_id, arg->client_arg);

    /* We need to handle the nudge at a safe, nolinking spot */
    if (safe_is_in_fcache(dcontext, (byte *)sc->SC_XIP, (byte*)sc->SC_XSP) &&
        dcontext->interrupted_for_nudge == NULL) {
        /* We unlink the interrupted fragment and skip any inlined syscalls to
         * bound the nudge delivery time.  If we already unlinked one we assume
         * that's sufficient.
         */
        fragment_t wrapper;
        fragment_t *f = fragment_pclookup(dcontext, (byte *)sc->SC_XIP, &wrapper);
        if (f != NULL) {
            /* Remember the fragment only if it was actually unlinked */
            if (unlink_fragment_for_signal(dcontext, f, (byte *)sc->SC_XIP))
                dcontext->interrupted_for_nudge = f;
        }
    }

    /* No lock is needed since thread-private and this signal is blocked now */
    nudge_add_pending(dcontext, arg);

    return false; /* do not pass to app */
}
